| Crates.io | stateset-rl-core |
| lib.rs | stateset-rl-core |
| version | 0.1.0 |
| created_at | 2025-12-11 01:52:24.203018+00 |
| updated_at | 2025-12-11 01:52:24.203018+00 |
| description | High-performance Rust core for reinforcement learning - GAE, advantage computation, reward normalization with optional Python bindings |
| homepage | https://github.com/stateset/stateset-agents/tree/master/rust_core |
| repository | https://github.com/stateset/stateset-agents |
| id | 1979005 |
| size | 56,572 |
High-performance Rust implementations of reinforcement learning operations, with optional Python bindings via PyO3.
To use the crate from Rust, add it to your Cargo.toml:

[dependencies]
stateset-rl-core = "0.1"

To build the optional Python bindings with maturin:

cd rust_core
maturin develop --release
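To confirm the bindings built and import correctly, a quick check from Python (assuming the module is importable as stateset_rl_core, as in the examples below):

import stateset_rl_core
print([name for name in dir(stateset_rl_core) if not name.startswith("_")])  # should include compute_gae, etc.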
Rust usage:

use stateset_rl_core::{compute_gae_internal, compute_advantages_for_group};
// Compute GAE
let rewards = vec![1.0, 0.0, 1.0, 0.0];
let values = vec![0.5, 0.5, 0.5, 0.5, 0.0]; // n+1 values for bootstrap
let advantages = compute_gae_internal(&rewards, &values, 0.99, 0.95);
// Compute group-relative advantages
let group_rewards = vec![1.0, 2.0, 3.0, 4.0];
let advantages = compute_advantages_for_group(&group_rewards, "mean", true);
Python usage (after building the bindings):

import numpy as np
import stateset_rl_core
# Compute GAE
rewards = np.array([1.0, 0.0, 1.0, 0.0])
values = np.array([0.5, 0.5, 0.5, 0.5, 0.0])
advantages = stateset_rl_core.compute_gae(rewards, values, gamma=0.99, gae_lambda=0.95)
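# For reference, a plain-NumPy sketch of the recurrence compute_gae is expected to implement
# (delta_t = r_t + gamma * V_{t+1} - V_t; A_t = delta_t + gamma * lambda * A_{t+1}).
# This is a correctness-check sketch; the crate's exact edge-case handling may differ.
def gae_reference(rewards, values, gamma=0.99, gae_lambda=0.95):
    adv = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        running = delta + gamma * gae_lambda * running
        adv[t] = running
    return adv
# np.allclose(advantages, gae_reference(rewards, values)) should hold if the semantics match.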
# Batch GAE (parallel)
all_rewards = [np.random.randn(100) for _ in range(32)]
all_values = [np.random.randn(101) for _ in range(32)]
all_advantages = stateset_rl_core.batch_compute_gae(all_rewards, all_values)
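# Sanity check (assumes the batch version returns one advantage array per trajectory
# and shares the single-trajectory defaults gamma=0.99, gae_lambda=0.95):
assert np.allclose(all_advantages[0], stateset_rl_core.compute_gae(all_rewards[0], all_values[0]))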
# Group-relative advantages for GRPO
rewards_2d = np.random.randn(16, 4) # 16 groups, 4 samples each
advantages = stateset_rl_core.compute_group_advantages(rewards_2d, "mean", normalize=True)
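# A rough NumPy equivalent of what the "mean" baseline with normalize=True is assumed to do:
# subtract each group's mean reward, then scale by the group's standard deviation.
# The crate's exact epsilon/ddof choices may differ slightly from this sketch.
baseline = rewards_2d.mean(axis=1, keepdims=True)
manual = (rewards_2d - baseline) / (rewards_2d.std(axis=1, keepdims=True) + 1e-8)
# `manual` should be close to `advantages` above if these are the intended semantics.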
# Reward normalization with running stats
rewards = np.array([1.0, 2.0, 3.0])
normalized, mean, var, count = stateset_rl_core.normalize_rewards(rewards)
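# The returned running statistics can be fed back in to normalize a later batch,
# using the keyword names from the documented signature (running_mean, running_var, count):
next_rewards = np.array([4.0, 5.0, 6.0])
normalized2, mean, var, count = stateset_rl_core.normalize_rewards(
    next_rewards, running_mean=mean, running_var=var, count=count
)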
# GSPO importance ratios
log_probs_new = np.array([-10.0, -12.0, -11.0])
log_probs_old = np.array([-10.5, -11.5, -11.0])
seq_lengths = np.array([50, 60, 55])
ratios = stateset_rl_core.compute_gspo_importance_ratios(log_probs_new, log_probs_old, seq_lengths)
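# GSPO uses sequence-level importance ratios. Given summed log-probs and sequence lengths,
# the ratios are assumed here to be length-normalized: exp((logp_new - logp_old) / length).
manual_ratios = np.exp((log_probs_new - log_probs_old) / seq_lengths)
# `manual_ratios` should be close to `ratios` if that is the definition the crate uses.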
# PPO surrogate objective
ratios = np.array([1.1, 0.9, 1.05])
advantages = np.array([1.0, -1.0, 0.5])
objectives = stateset_rl_core.compute_ppo_surrogate(ratios, advantages, clip_epsilon=0.2)
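# For reference, the standard PPO clipped surrogate that compute_ppo_surrogate is expected
# to evaluate: the element-wise minimum of the unclipped and clipped terms
# (a sketch of the assumed semantics, not the crate's exact implementation).
unclipped = ratios * advantages
clipped = np.clip(ratios, 1.0 - 0.2, 1.0 + 0.2) * advantages
manual_objectives = np.minimum(unclipped, clipped)  # should be close to `objectives` above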
Python API:

- compute_gae(rewards, values, gamma=0.99, gae_lambda=0.95) - single-trajectory GAE
- batch_compute_gae(all_rewards, all_values, gamma=0.99, gae_lambda=0.95) - parallel batch GAE
- compute_group_advantages(rewards_2d, baseline_type, normalize) - GRPO-style group advantages; baseline_type is "mean", "median", or "min"
- normalize_rewards(rewards, running_mean=0, running_var=1, count=0, epsilon=1e-8) - online normalization with running statistics
- clip_rewards(rewards, min_val, max_val) - reward clipping
- compute_reward_statistics(rewards) - mean, std, min, max, median
- compute_gspo_importance_ratios(log_probs_new, log_probs_old, sequence_lengths) - GSPO importance ratios
- apply_gspo_clipping(ratios, advantages, clip_left=3e-4, clip_right=4e-4) - GSPO clipping
- compute_ppo_surrogate(ratios, advantages, clip_epsilon=0.2) - PPO clipped objective

This crate is optimized for performance.
Typical speedups over pure Python/NumPy: 10-100x for batch operations.
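As a rough way to check the speedup claim on your own machine (numbers vary by hardware; this is a minimal timing sketch, not an official benchmark):

import time
import numpy as np
import stateset_rl_core

all_rewards = [np.random.randn(1000) for _ in range(256)]
all_values = [np.random.randn(1001) for _ in range(256)]

start = time.perf_counter()
stateset_rl_core.batch_compute_gae(all_rewards, all_values)
print(f"batch GAE over 256 trajectories: {time.perf_counter() - start:.4f}s")
# Compare against a pure-Python loop (e.g. the gae_reference sketch above) to gauge the gap.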
License: MIT