<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2009/04/01/roofline-an-insightful-visual-performance-model-for-multicore-architectures.html</loc>
<lastmod>2009-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2010/01/01/demystifying-gpu-microarchitecture-through-microbenchmarking.html</loc>
<lastmod>2010-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2013/12/01/playing-atari-with-deep-reinforcement-learning.html</loc>
<lastmod>2013-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/2013/12/01/what-makes-good-data-for-alignment-a-comprehensive-study-of-automatic-data-selection-in-instruction-tuning.html</loc>
<lastmod>2013-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2014/11/01/introducing-data-center-fabric-the-next-generation-facebook-data-center-network.html</loc>
<lastmod>2014-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2015/07/01/massively-parallel-methods-for-deep-reinforcement-learning.html</loc>
<lastmod>2015-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2015/07/01/ucx-an-open-source-framework-for-hpc-network-apis-and-beyond.html</loc>
<lastmod>2015-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2015/08/01/jupiter-rising-a-decade-of-clos-topologies-and-centralized-control-in-googles-datacenter-network.html</loc>
<lastmod>2015-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2016/01/01/mastering-the-game-of-go-with-deep-neural-networks-and-tree-search.html</loc>
<lastmod>2016-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2016/01/01/single-pass-parallel-prefix-scan-with-decoupled-look-back.html</loc>
<lastmod>2016-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2016/04/01/optimizing-performance-of-recurrent-neural-networks-on-gpus.html</loc>
<lastmod>2016-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2016/06/01/design-guidelines-for-high-performance-rdma-systems.html</loc>
<lastmod>2016-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2017/04/01/locality-aware-cta-clustering-for-modern-gpus.html</loc>
<lastmod>2017-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2017/05/01/offloading-communication-control-logic-in-gpu-accelerated-applications.html</loc>
<lastmod>2017-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2017/08/01/proximal-policy-optimization-algorithms.html</loc>
<lastmod>2017-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2017/10/01/optimizing-cache-bypassing-and-warp-scheduling-for-gpus.html</loc>
<lastmod>2017-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2017/12/01/gpu-centric-communication-on-nvidia-gpu-clusters-with-infiniband-a-case-study-with-openshmem.html</loc>
<lastmod>2017-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2017/12/01/rllib-abstractions-for-distributed-reinforcement-learning.html</loc>
<lastmod>2017-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2018/03/01/improving-real-time-performance-with-cuda-persistent-threads-cuper-on-the-jetson-tx2.html</loc>
<lastmod>2018-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2018/04/01/gpudirect-async-exploring-gpu-synchronous-communication-techniques-for-infiniband-clusters.html</loc>
<lastmod>2018-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2018/06/01/pipedream-fast-and-efficient-pipeline-parallel-dnn-training.html</loc>
<lastmod>2018-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2018/06/01/tensor-comprehensions-framework-agnostic-high-performance-machine-learning-abstractions.html</loc>
<lastmod>2018-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/transformer-variant/2018/07/01/universal-transformers.html</loc>
<lastmod>2018-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/compiler/2018/10/01/tvm-an-automated-end-to-end-optimizing-compiler-for-deep-learning.html</loc>
<lastmod>2018-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2018/11/01/blockwise-parallel-decoding-for-deep-autoregressive-models.html</loc>
<lastmod>2018-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/ssm/2019/01/01/transformer-xl-attentive-language-models-beyond-a-fixed-length-context.html</loc>
<lastmod>2019-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2019/07/01/gpipe-easy-scaling-with-micro-batch-pipeline-parallelism.html</loc>
<lastmod>2019-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2019/10/01/netdimm-low-latency-near-memory-network-interface-architecture.html</loc>
<lastmod>2019-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2019/10/01/seed-rl-scalable-and-efficient-deep-rl-with-accelerated-central-inference.html</loc>
<lastmod>2019-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2019/10/01/transformers-state-of-the-art-natural-language-processing.html</loc>
<lastmod>2019-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2019/11/01/fast-transformer-decoding-one-write-head-is-all-you-need.html</loc>
<lastmod>2019-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2020/01/01/scaling-laws-for-neural-language-models.html</loc>
<lastmod>2020-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2020/02/01/gpu-initiated-openshmem-correct-and-eicient-intra-kernel-networking-for-dgpus.html</loc>
<lastmod>2020-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2020/02/01/low-rank-bottleneck-in-multi-head-attention-models.html</loc>
<lastmod>2020-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2020/03/01/megatron-lm-training-multi-billion-parameter-language-models-using-model-parallelism.html</loc>
<lastmod>2020-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2020/03/01/zero-memory-optimizations-toward-training-trillion-parameter-models.html</loc>
<lastmod>2020-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/sparsity/2020/04/01/longformer-the-long-document-transformer.html</loc>
<lastmod>2020-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2020/06/01/gshard-scaling-giant-models-with-conditional-computation-and-automatic-sharding.html</loc>
<lastmod>2020-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/linear/2020/06/01/transformers-are-rnns-fast-autoregressive-transformers-with-linear-attention.html</loc>
<lastmod>2020-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2020/08/01/an-in-depth-analysis-of-the-slingshot-interconnect.html</loc>
<lastmod>2020-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2020/09/01/fusionstitching-boosting-memory-intensive-computations-for-deep-learning-workloads.html</loc>
<lastmod>2020-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2020/09/01/learning-to-summarize-from-human-feedback.html</loc>
<lastmod>2020-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/eval/2020/09/01/measuring-massive-multitask-language-understanding.html</loc>
<lastmod>2020-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2020/11/01/ansor-generating-high-performance-tensor-programs-for-deep-learning.html</loc>
<lastmod>2020-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2021/01/01/unit-unifying-tensorized-instruction-compilation.html</loc>
<lastmod>2021-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2021/01/01/zero-offload-democratizing-billion-scale-model-training.html</loc>
<lastmod>2021-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2021/02/01/c-for-metal-high-performance-simd-programming-on-intel-gpus.html</loc>
<lastmod>2021-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2021/02/01/checkfreq-frequent-fine-grained-dnn-checkpointing.html</loc>
<lastmod>2021-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/linear/2021/02/01/learning-associative-inference-using-fast-weight-memory.html</loc>
<lastmod>2021-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2021/02/01/mlir-scaling-compiler-infrastructure-for-domain-specific-computation.html</loc>
<lastmod>2021-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2021/02/01/progressive-raising-in-multi-level-ir.html</loc>
<lastmod>2021-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/2021/04/01/roformer-enhanced-transformer-with-rotary-position-embedding.html</loc>
<lastmod>2021-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2021/04/01/zero-infinity-breaking-the-gpu-memory-wall-for-extreme-scale-deep-learning.html</loc>
<lastmod>2021-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/linear/2021/06/01/linear-transformers-are-secretly-fast-weight-programmers.html</loc>
<lastmod>2021-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2021/06/01/lora-low-rank-adaptation-of-large-language-models.html</loc>
<lastmod>2021-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2021/07/01/chimera-efficiently-training-large-scale-neural-networks-with-bidirectional-pipelines.html</loc>
<lastmod>2021-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2021/07/01/pet-optimizing-tensor-programs-with-partially-equivalent-transformations-and-automated-corrections.html</loc>
<lastmod>2021-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2021/08/01/dnnfusion-accelerating-deep-neural-networks-execution-with-advanced-operator-fusion.html</loc>
<lastmod>2021-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2021/08/01/efficient-large-scale-language-model-training-on-gpu-clusters-using-megatron-lm.html</loc>
<lastmod>2021-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2021/10/01/bolt-bridging-the-gap-between-auto-tuners-and-hardware-native-performance.html</loc>
<lastmod>2021-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/ssm/2021/10/01/combining-recurrent-convolutional-and-continuous-time-models-with-linear-state-space-layers.html</loc>
<lastmod>2021-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/ssm/2021/10/01/efficiently-modeling-long-sequences-with-structured-state-spaces.html</loc>
<lastmod>2021-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2021/12/01/glam-efficient-scaling-of-language-models-with-mixture-of-experts.html</loc>
<lastmod>2021-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2021/12/01/torchfx-practical-program-capture-and-transformation-for-deep-learning-in-python.html</loc>
<lastmod>2021-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2022/01/01/a-compiler-framework-for-optimizing-dynamic-parallelism-on-gpus.html</loc>
<lastmod>2022-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/framework/2022/01/01/campo-cost-aware-performance-optimization-for-mixed-precision-neural-network-training.html</loc>
<lastmod>2022-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2022/01/01/chain-of-thought-prompting-elicits-reasoning-in-large-language-models.html</loc>
<lastmod>2022-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2022/01/01/darm-control-flow-melding-for-simt-thread-divergence-reduction.html</loc>
<lastmod>2022-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2022/01/01/deepspeed-moe-advancing-mixture-of-experts-inference-and-training-to-power-next-generation-ai-scale.html</loc>
<lastmod>2022-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2022/01/01/nvidia-h100-tensor-core-gpu-architecture.html</loc>
<lastmod>2022-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/linear/2022/01/01/transformer-quality-in-linear-time.html</loc>
<lastmod>2022-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2022/02/01/astitch-enabling-a-new-multi-dimensional-optimization-space-for-memory-intensive-ml-training-and-inference-on-modern-simt-architectures.html</loc>
<lastmod>2022-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2022/02/01/doubling-all2all-performance-with-nvidia-collective-communication-library-212.html</loc>
<lastmod>2022-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2022/02/01/neoflow-a-flexible-framework-for-enabling-efficient-compilation-for-high-performance-dnn-training.html</loc>
<lastmod>2022-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2022/02/01/st-moe-designing-stable-and-transferable-sparse-expert-models.html</loc>
<lastmod>2022-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/2022/03/01/deepnet-scaling-transformers-to-1000-layers.html</loc>
<lastmod>2022-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/2022/03/01/tensor-programs-v-tuning-large-neural-networks-via-zero-shot-hyperparameter-transfer.html</loc>
<lastmod>2022-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/2022/03/01/training-compute-optimal-large-language-models.html</loc>
<lastmod>2022-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2022/03/01/training-language-models-to-follow-instructions-with-human-feedback.html</loc>
<lastmod>2022-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2022/04/01/palm-scaling-language-modeling-with-pathways.html</loc>
<lastmod>2022-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2022/04/01/training-a-helpful-and-harmless-assistant-with-reinforcement-learning-from-human-feedback.html</loc>
<lastmod>2022-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2022/05/01/dietcode-automatic-optimization-for-dynamic-tensor-programs.html</loc>
<lastmod>2022-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/cpu/2022/05/01/everything-you-need-to-know-about-the-cpu-power-management.html</loc>
<lastmod>2022-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2022/05/01/fastermoe-modeling-and-optimizing-training-of-large-scale-dynamic-pre-trained-models.html</loc>
<lastmod>2022-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2022/05/01/pathways-asynchronous-distributed-dataflow-for-ml.html</loc>
<lastmod>2022-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/cpu/2022/05/01/understanding-bios-configuration-for-performance-tuning.html</loc>
<lastmod>2022-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2022/06/01/efficiently-emulating-high-bitwidth-computation-with-low-bitwidth-hardware.html</loc>
<lastmod>2022-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2022/06/01/gmi-drl-empowering-multi-gpu-deep-reinforcement-learning-with-gpu-spatial-multiplexing.html</loc>
<lastmod>2022-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2022/06/01/tutel-adaptive-mixture-of-experts-at-scale.html</loc>
<lastmod>2022-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2022/07/01/alpa-automating-inter-and-intra-operator-parallelism-for-distributed-deep-learning.html</loc>
<lastmod>2022-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2022/07/01/flashattention-fast-and-memory-efficient-exact-attention-with-io-awareness.html</loc>
<lastmod>2022-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2022/07/01/microsecond-scale-preemption-for-concurrent-gpu-accelerated-dnn-inferences.html</loc>
<lastmod>2022-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2022/07/01/orca-a-distributed-serving-system-for-transformer-based-generative-models.html</loc>
<lastmod>2022-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2022/07/01/reexamining-direct-cache-access-to-optimize-io-intensive-applications-for-multi-hundred-gigabit-networks.html</loc>
<lastmod>2022-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2022/07/01/roller-fast-and-efficient-tensor-compilation-for-deep-learning.html</loc>
<lastmod>2022-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2022/07/01/unity-accelerating-dnn-training-through-joint-optimization-of-algebraic-transformations-and-parallelization.html</loc>
<lastmod>2022-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2022/07/01/webshop-towards-scalable-real-world-web-interaction-with-grounded-language-agents.html</loc>
<lastmod>2022-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2022/08/01/fp8-quantization-the-power-of-the-exponent.html</loc>
<lastmod>2022-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2022/09/01/apollo-automatic-partition-based-operator-fusion-throughlayer-by-layer-optimization.html</loc>
<lastmod>2022-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2022/09/01/fp8-formats-for-deep-learning.html</loc>
<lastmod>2022-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2022/10/01/hidet-task-mapping-programming-paradigm-for-deep-learning-tensor-programs.html</loc>
<lastmod>2022-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2022/10/01/tensorir-an-abstraction-for-automatic-tensorized-program-optimization.html</loc>
<lastmod>2022-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/linear/2022/10/01/the-devil-in-linear-transformer.html</loc>
<lastmod>2022-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2022/11/01/efficiently-scaling-transformer-inference.html</loc>
<lastmod>2022-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2022/11/01/improving-network-performance-of-hpc-systems-using-nvidia-magnum-io-nvshmem-and-gpudirect-async.html</loc>
<lastmod>2022-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2022/11/01/on-optimizing-the-communication-of-model-parallelism.html</loc>
<lastmod>2022-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2022/11/01/smoothquant-accurate-and-efficient-post-training-quantization-for-large-language-models.html</loc>
<lastmod>2022-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2022/11/01/tlp-a-deep-learning-based-cost-model-for-tensor-program-tuning.html</loc>
<lastmod>2022-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2023/01/01/a-135-gbpsgbit-066-pjbit-stacked-embedded-dram-with-multilayer-arrays-by-fine-pitch-hybrid-bonding-and-mini-tsv.html</loc>
<lastmod>2023-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2023/01/01/onednn-graph-compiler-a-hybrid-approach-for-high-performance-deep-learning-compilation.html</loc>
<lastmod>2023-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2023/01/01/stream-k-work-centric-parallel-decomposition-for-dense-matrix-matrix-multiplication-on-the-gpu.html</loc>
<lastmod>2023-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2023/02/01/accelerating-large-language-model-decoding-with-speculative-sampling.html</loc>
<lastmod>2023-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2023/02/01/llama-open-and-efficient-foundation-language-models.html</loc>
<lastmod>2023-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/2023/02/01/to-pack-or-not-to-pack-a-generalized-packing-analysis-and-transformation.html</loc>
<lastmod>2023-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2023/02/01/toolformer-language-models-can-teach-themselves-to-use-tools.html</loc>
<lastmod>2023-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2023/03/01/deepspeed-chat-easy-fast-and-affordable-rlhf-training-of-chatgpt-like-models-at-all-scales.html</loc>
<lastmod>2023-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2023/03/01/graphene-an-ir-for-optimized-tensor-computations-on-gpus.html</loc>
<lastmod>2023-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2023/03/01/reac-t-synergizing-reasoning-and-acting-in-language-models.html</loc>
<lastmod>2023-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2023/03/01/scaling-vision-language-models-with-sparse-mixture-of-experts.html</loc>
<lastmod>2023-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/ssm/2023/03/01/simplified-state-space-layers-for-sequence-modeling.html</loc>
<lastmod>2023-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/ssm/2023/04/01/hungry-hungry-hippos-towards-language-modeling-with-state-space-models.html</loc>
<lastmod>2023-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2023/04/01/pytorch-fsdp-experiences-on-scaling-fully-sharded-data-parallel.html</loc>
<lastmod>2023-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2023/04/01/stable-and-low-precision-training-for-large-scale-vision-language-models.html</loc>
<lastmod>2023-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2023/04/01/with-shared-microexponents-a-little-shifting-goes-a-long-way.html</loc>
<lastmod>2023-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2023/05/01/a-framework-for-fine-grained-synchronization-of-dependent-gpu-kernels.html</loc>
<lastmod>2023-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2023/05/01/acrobat-optimizing-auto-batching-of-dynamic-deep-learning-at-compile-time.html</loc>
<lastmod>2023-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2023/05/01/alcop-automatic-load-compute-pipelining-in-deep-learning-compiler-for-ai-gpus.html</loc>
<lastmod>2023-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2023/05/01/autoscratch-ml-optimized-cache-management-for-inference-oriented-gpus.html</loc>
<lastmod>2023-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2023/05/01/direct-preference-optimization-your-language-model-is-secretly-a-reward-model.html</loc>
<lastmod>2023-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2023/05/01/fast-inference-from-transformers-via-speculative-decoding.html</loc>
<lastmod>2023-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2023/05/01/hardware-compute-partitioning-on-nvidia-gpus.html</loc>
<lastmod>2023-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2023/05/01/integer-or-floating-point-new-outlooks-for-low-bit-quantization-on-large-language-models.html</loc>
<lastmod>2023-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/eval/2023/05/01/on-the-tool-manipulation-capability-of-open-source-large-language-models.html</loc>
<lastmod>2023-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2023/05/01/sirius-harvesting-whole-program-optimization-opportunitiesfor-dnns.html</loc>
<lastmod>2023-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2023/06/01/awq-activation-aware-weight-quantization-for-on-device-llm-compression-and-acceleration.html</loc>
<lastmod>2023-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2023/06/01/fp8-versus-int8-for-efficient-deep-learning-inference.html</loc>
<lastmod>2023-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/transformer-variant/2023/07/01/attention-is-off-by-one.html</loc>
<lastmod>2023-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2023/07/01/cocktailer-analyzing-and-optimizing-dynamic-control-flow-in-deep-learning.html</loc>
<lastmod>2023-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2023/07/01/effectively-scheduling-computational-graphs-of-deep-neural-networks-toward-their-domain-specific-accelerators.html</loc>
<lastmod>2023-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2023/07/01/einnet-optimizing-tensor-programs-with-derivation-based-transformations.html</loc>
<lastmod>2023-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2023/07/01/flashattention-2-faster-attention-with-better-parallelism-and-work-partitioning.html</loc>
<lastmod>2023-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2023/07/01/llama-2-open-foundation-and-fine-tuned-chat-models.html</loc>
<lastmod>2023-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2023/07/01/optimizing-dynamic-neural-networks-with-brainstorm.html</loc>
<lastmod>2023-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2023/07/01/overview-of-and-motivation-for-the-forthcoming-ultra-ethernet-consortium-specification.html</loc>
<lastmod>2023-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2023/07/01/powerfusion-a-tensor-compiler-with-explicit-data-movement-description-and-instruction-level-graph-ir.html</loc>
<lastmod>2023-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2023/07/01/rail-only-a-low-cost-high-performance-network-for-training-llms-with-trillion-parameters.html</loc>
<lastmod>2023-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/linear/2023/07/01/scaling-transnormer-to-175-billion-parameters.html</loc>
<lastmod>2023-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2023/07/01/welder-scheduling-deep-learning-memory-access-via-tile-graph.html</loc>
<lastmod>2023-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2023/08/01/sarathi-efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills.html</loc>
<lastmod>2023-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2023/09/01/deepspeed-ulysses-system-optimizations-for-enabling-training-of-extreme-long-sequence-transformer-models.html</loc>
<lastmod>2023-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2023/09/01/efficient-memory-management-for-large-language-model-serving-with-pagedattention.html</loc>
<lastmod>2023-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2023/09/01/ocp-microscaling-formats-mx-specification.html</loc>
<lastmod>2023-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2023/09/01/tree-of-thoughts-deliberate-problem-solving-with-large-language-models.html</loc>
<lastmod>2023-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2023/10/01/cachegen-kv-cache-compression-and-streaming-for-fast-large-language-model-serving.html</loc>
<lastmod>2023-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2023/10/01/fault-tolerant-hybrid-parallel-training-at-scale-with-reliable-and-efficient-in-memory-checkpointing.html</loc>
<lastmod>2023-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2023/10/01/fireact-toward-language-agent-fine-tuning.html</loc>
<lastmod>2023-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2023/10/01/flash-decoding-for-long-context-inference.html</loc>
<lastmod>2023-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2023/10/01/flextrain-a-dynamic-training-framework-for-heterogeneous-devices-environments.html</loc>
<lastmod>2023-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2023/10/01/gemini-fast-failure-recovery-in-distributed-training-with-in-memory-checkpoints.html</loc>
<lastmod>2023-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2023/10/01/nvidia-doca-gpunetio-programming-guide.html</loc>
<lastmod>2023-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2023/10/01/ring-attention-with-blockwise-transformers-for-near-infinite-context.html</loc>
<lastmod>2023-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2023/10/01/steerlm-attribute-conditioned-sft-as-an-user-steerable-alternative-to-rlhf.html</loc>
<lastmod>2023-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2023/10/01/tackling-the-matrix-multiplication-micro-kernel-generation-with-exo.html</loc>
<lastmod>2023-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/eval/2023/11/01/gaia-a-benchmark-for-general-ai-assistants.html</loc>
<lastmod>2023-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/eval/2023/11/01/instruction-following-evaluation-for-large-language-models.html</loc>
<lastmod>2023-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2023/11/01/striped-attention-faster-ring-attention-for-causal-transformers.html</loc>
<lastmod>2023-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2023/11/01/zero-bubble-pipeline-parallelism.html</loc>
<lastmod>2023-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2023/12/01/experiences-building-an-mlir-based-sycl-compiler.html</loc>
<lastmod>2023-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/linear/2023/12/01/gated-linear-attention-transformers-with-hardware-efficient-training.html</loc>
<lastmod>2023-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2023/12/01/gqa-training-generalized-multi-query-transformer-models-from-multi-head-checkpoints.html</loc>
<lastmod>2023-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2023/12/01/jitspmm-just-in-time-instruction-generation-for-accelerated-sparse-matrix-matrix-multiplication.html</loc>
<lastmod>2023-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/ssm/2023/12/01/mamba-linear-time-sequence-modeling-with-selective-state-spaces.html</loc>
<lastmod>2023-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2023/12/01/overlap-communication-with-dependent-computation-via-decomposition-in-large-deep-learning-models.html</loc>
<lastmod>2023-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2023/12/01/retrieval-augmented-generation-for-large-language-models-a-survey.html</loc>
<lastmod>2023-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2023/12/01/superserve-fine-grained-inference-serving-for-unpredictable-workloads.html</loc>
<lastmod>2023-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2024/01/01/deepseek-coder-when-the-large-language-model-meets-programming-the-rise-of-code-intelligence.html</loc>
<lastmod>2024-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2024/01/01/deepseekmoe-towards-ultimate-expert-specialization-in-mixture-of-experts-language-models.html</loc>
<lastmod>2024-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2024/01/01/distserve-disaggregating-prefill-and-decoding-for-goodput-optimized-large-language-model-serving.html</loc>
<lastmod>2024-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2024/01/01/eagle-speculative-sampling-requires-rethinking-feature-uncertainty.html</loc>
<lastmod>2024-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2024/01/01/fp6-llm-efficiently-serving-large-language-models-through-fp6-centric-algorithm-system-co-design.html</loc>
<lastmod>2024-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2024/01/01/gmlake-efficient-and-transparent-gpu-memory-defragmentation-for-large-scale-dnn-training-with-virtual-memory-stitching.html</loc>
<lastmod>2024-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/linear/2024/01/01/lightning-attention-2-a-free-lunch-for-handling-unlimited-sequence-lengths-in-large-language-models.html</loc>
<lastmod>2024-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2024/01/01/on-policy-distillation-of-language-models-learning-from-self-generated-mistakes.html</loc>
<lastmod>2024-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2024/01/01/optimal-kernel-orchestration-for-tensor-programs-with-korch.html</loc>
<lastmod>2024-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/2024/01/01/polytops-reconfigurable-and-flexible-polyhedral-scheduler.html</loc>
<lastmod>2024-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2024/01/01/self-play-fine-tuning-converts-weak-language-models-to-strong-language-models.html</loc>
<lastmod>2024-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2024/02/01/deepseekmath-pushing-the-limits-of-mathematical-reasoning-in-open-language-models.html</loc>
<lastmod>2024-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2024/02/01/massive-activations-in-large-language-models.html</loc>
<lastmod>2024-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2024/02/01/megascale-scaling-large-language-model-training-to-more-than-10000-gpus.html</loc>
<lastmod>2024-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/ssm/2024/02/01/moe-mamba-efficient-selective-state-space-models-with-mixture-of-experts.html</loc>
<lastmod>2024-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2024/02/01/sod2-statically-optimizing-dynamic-deep-neural-network-execution.html</loc>
<lastmod>2024-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2024/03/01/deft-decoding-with-flash-tree-attention-for-efficient-tree-structured-llm-inference.html</loc>
<lastmod>2024-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/2024/03/01/depyf-open-the-opaque-box-of-pytorch-compiler-for-machine-learning-researchers.html</loc>
<lastmod>2024-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2024/03/01/gemini-15-unlocking-multimodal-understanding-across-millions-of-tokens-of-context.html</loc>
<lastmod>2024-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/ssm/2024/03/01/jamba-a-hybrid-transformer-mamba-language-model.html</loc>
<lastmod>2024-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/transformer-variant/2024/03/01/scaling-up-test-time-compute-with-latent-reasoning-a-recurrent-depth-approach.html</loc>
<lastmod>2024-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2024/03/01/scattered-mixture-of-experts-implementation.html</loc>
<lastmod>2024-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2024/03/01/wasp-exploiting-gpu-pipeline-parallelism-with-hardware-accelerated-automatic-warp-specialization.html</loc>
<lastmod>2024-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2024/04/01/better-faster-large-language-models-via-multi-token-prediction.html</loc>
<lastmod>2024-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2024/04/01/felix-optimizing-tensor-programs-with-gradient-descent.html</loc>
<lastmod>2024-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2024/04/01/hydride-a-retargetable-and-extensible-synthesis-based-compiler-for-modern-hardware-architectures.html</loc>
<lastmod>2024-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2024/04/01/leave-no-context-behind-efficient-infinite-context-transformers-with-infini-attention.html</loc>
<lastmod>2024-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/linear/2024/04/01/linear-attention-sequence-parallelism.html</loc>
<lastmod>2024-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2024/04/01/magis-memory-optimization-via-coordinated-graph-transformation-and-scheduling-for-dnn.html</loc>
<lastmod>2024-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2024/04/01/optimizing-deep-learning-inference-via-global-analysis-and-tensor-expressions.html</loc>
<lastmod>2024-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2024/04/01/optimizing-dynamic-shape-neural-networks-on-accelerators-via-on-the-fly-micro-kernel-polymerization.html</loc>
<lastmod>2024-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2024/04/01/prompt-cache-modular-attention-reuse-for-low-latency-inference.html</loc>
<lastmod>2024-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2024/04/01/proteus-a-high-throughput-inference-serving-system-with-accuracy-scaling.html</loc>
<lastmod>2024-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2024/04/01/pytorch-2-faster-machine-learning-through-dynamic-python-bytecode-transformation-and-graph-compilation.html</loc>
<lastmod>2024-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2024/04/01/scaling-up-memory-disaggregated-applications-with-smart.html</loc>
<lastmod>2024-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2024/04/01/shortcut-connected-expert-parallelism-for-accelerating-mixture-of-experts.html</loc>
<lastmod>2024-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2024/05/01/cacheblend-fast-large-language-model-serving-for-rag-with-cached-knowledge-fusion.html</loc>
<lastmod>2024-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2024/05/01/efficient-heterogeneous-large-language-model-decoding-with-model-attention-disaggregation.html</loc>
<lastmod>2024-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/eval/2024/05/01/livecodebench-holistic-and-contamination-free-evaluation-of-large-language-models-for-code.html</loc>
<lastmod>2024-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2024/05/01/memoe-enhancing-model-editing-with-mixture-of-experts-adaptors.html</loc>
<lastmod>2024-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2024/05/01/nemo-aligner-scalable-toolkit-for-efficient-model-alignment.html</loc>
<lastmod>2024-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2024/05/01/openrlhf-an-easy-to-use-scalable-and-high-performance-rlhf-framework.html</loc>
<lastmod>2024-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2024/05/01/pipeline-parallelism-with-controllable-memory.html</loc>
<lastmod>2024-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2024/05/01/preble-efficient-distributed-prompt-scheduling-for-llm-serving.html</loc>
<lastmod>2024-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2024/05/01/splitwise-efficient-generative-llm-inference-using-phase-splitting.html</loc>
<lastmod>2024-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/2024/05/01/stacking-your-transformers-a-closer-look-at-model-growth-for-efficient-llm-pre-training.html</loc>
<lastmod>2024-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/ssm/2024/05/01/transformers-are-ssms-generalized-models-and-efficient-algorithms-through-structured-state-space-duality.html</loc>
<lastmod>2024-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/linear/2024/05/01/various-lengths-constant-speed-efficient-language-modeling-with-lightning-attention.html</loc>
<lastmod>2024-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2024/05/01/you-only-cache-once-decoder-decoder-architectures-for-language-models.html</loc>
<lastmod>2024-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2024/06/01/deepseek-v2-a-strong-economical-and-efficient-mixture-of-experts-language-model.html</loc>
<lastmod>2024-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2024/06/01/eagle-2-faster-inference-of-language-models-with-dynamic-draft-trees.html</loc>
<lastmod>2024-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2024/06/01/medusa-simple-llm-inference-acceleration-framework-with-multiple-decoding-heads.html</loc>
<lastmod>2024-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2024/06/01/mind-the-gap-attainable-data-movement-and-operational-intensity-bounds-for-tensor-algorithms.html</loc>
<lastmod>2024-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2024/06/01/protrain-efficient-llm-training-via-adaptive-memory-management.html</loc>
<lastmod>2024-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2024/06/01/universal-checkpointing-a-flexible-and-efficient-distributed-checkpointing-system-for-large-scale-dnn-training-with-reconfigurable-parallelism.html</loc>
<lastmod>2024-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2024/07/01/accelerating-the-training-of-large-language-models-using-efficient-activation-rematerialization-and-optimal-hybrid-parallelism.html</loc>
<lastmod>2024-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2024/07/01/cost-efficient-large-language-model-serving-for-multi-turn-conversations-with-cachedattention.html</loc>
<lastmod>2024-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2024/07/01/efficient-training-of-large-language-models-on-distributed-infrastructures-a-survey.html</loc>
<lastmod>2024-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2024/07/01/enabling-tensor-language-model-to-assist-in-generating-high-performance-tensor-programs-for-deep-learning.html</loc>
<lastmod>2024-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2024/07/01/flashattention-3-fast-and-accurate-attention-with-asynchrony-and-low-precision.html</loc>
<lastmod>2024-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2024/07/01/helpsteer2-open-source-dataset-for-training-top-performing-reward-models.html</loc>
<lastmod>2024-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2024/07/01/ladder-enabling-efficient-low-precision-deep-learning-computing-through-hardware-aware-tensor-transformation.html</loc>
<lastmod>2024-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2024/07/01/magpy-compiling-eager-mode-dnn-programs-by-monitoring-execution-states.html</loc>
<lastmod>2024-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/sparsity/2024/07/01/minference-10-accelerating-pre-filling-for-long-context-llms-via-dynamic-sparse-attention.html</loc>
<lastmod>2024-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2024/07/01/mooncake-a-kvcache-centric-disaggregated-architecture-for-llm-serving.html</loc>
<lastmod>2024-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2024/07/01/nvidia-blackwell-architecture-technical-brief.html</loc>
<lastmod>2024-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/transformer-variant/2024/07/01/scaling-laws-with-vocabulary-larger-models-deserve-larger-vocabularies.html</loc>
<lastmod>2024-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2024/07/01/sglang-efficient-execution-of-structured-language-model-programs.html</loc>
<lastmod>2024-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2024/07/01/the-llama-3-herd-of-models.html</loc>
<lastmod>2024-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2024/07/01/usp-a-unified-sequence-parallelism-approach-for-long-context-generative-ai.html</loc>
<lastmod>2024-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2024/08/01/auxiliary-loss-free-load-balancing-strategy-for-mixture-of-experts.html</loc>
<lastmod>2024-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2024/08/01/disttrain-addressing-model-and-data-heterogeneity-with-disaggregated-training-for-multimodal-large-language-models.html</loc>
<lastmod>2024-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/2024/08/01/fusechat-knowledge-fusion-of-chat-models.html</loc>
<lastmod>2024-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/2024/08/01/inference-scaling-laws-an-empirical-analysis-of-compute-optimal-inference-for-llm-problem-solving.html</loc>
<lastmod>2024-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2024/08/01/lut-tensor-core-a-software-hardware-co-design-for-lut-based-low-bit-llm-inference.html</loc>
<lastmod>2024-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2024/08/01/magicdec-breaking-the-latency-throughput-tradeoff-for-long-context-generation-with-speculative-decoding.html</loc>
<lastmod>2024-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2024/08/01/marlin-mixed-precision-auto-regressive-parallel-inference-on-large-language-models.html</loc>
<lastmod>2024-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2024/08/01/nanoflow-towards-optimal-large-language-model-serving-throughput.html</loc>
<lastmod>2024-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/diffusions/dllm/2024/08/01/pipefusion-patch-level-pipeline-parallelism-for-diffusion-transformers-inference.html</loc>
<lastmod>2024-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2024/08/01/scaling-llm-test-time-compute-optimally-can-be-more-effective-than-scaling-model-parameters.html</loc>
<lastmod>2024-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2024/09/01/domino-eliminating-communication-in-llm-training-via-generic-tensor-slicing-and-overlapping.html</loc>
<lastmod>2024-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2024/09/01/hexiscale-accommodating-large-language-model-training-over-heterogeneous-environment.html</loc>
<lastmod>2024-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2024/09/01/large-language-model-based-agents-for-software-engineering-a-survey.html</loc>
<lastmod>2024-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/nccl/2024/09/01/memory-efficiency-faster-initialization-and-cost-estimation-with-nvidia-collective-communications-library-222.html</loc>
<lastmod>2024-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2024/09/01/mnemosyne-parallelization-strategies-for-efficiently-serving-multi-million-context-length-llm-inference-requests-without-approximations.html</loc>
<lastmod>2024-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2024/09/01/prescount-effective-register-allocation-for-bank-conflict-reduction.html</loc>
<lastmod>2024-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2024/09/01/retargeting-and-respecializing-gpu-workloads-for-performance-portability.html</loc>
<lastmod>2024-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2024/09/01/rlhfuse-efficient-rlhf-training-for-large-language-models-with-inter-and-intra-stage-fusion.html</loc>
<lastmod>2024-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/eval/2024/09/01/swe-bench-can-language-models-resolve-real-world-github-issues.html</loc>
<lastmod>2024-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2024/09/01/the-landscape-of-gpu-centric-communication.html</loc>
<lastmod>2024-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2024/10/01/do-large-language-models-need-a-content-delivery-network.html</loc>
<lastmod>2024-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2024/10/01/duoattention-efficient-long-context-llm-inference-with-retrieval-and-streaming-heads.html</loc>
<lastmod>2024-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2024/10/01/eps-moe-expert-pipeline-scheduler-for-cost-efficient-moe-inference.html</loc>
<lastmod>2024-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/compiler/2024/10/01/flux-fast-software-based-communication-overlap-on-gpus-through-kernel-fusion.html</loc>
<lastmod>2024-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2024/10/01/hybridflow-a-flexible-and-efficient-rlhf-framework.html</loc>
<lastmod>2024-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2024/10/01/moe-accelerating-mixture-of-experts-methods-with-zero-computation-experts.html</loc>
<lastmod>2024-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2024/10/01/sageattention-accurate-8-bit-attention-for-plug-and-play-inference-acceleration.html</loc>
<lastmod>2024-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2024/11/01/microscopiq-accelerating-foundational-models-through-outlier-aware-microscaling-quantization.html</loc>
<lastmod>2024-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2024/11/01/minder-faulty-machine-detection-for-large-scale-distributed-model-training.html</loc>
<lastmod>2024-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2024/11/01/sageattention2-efficient-attention-with-thorough-outlier-smoothing-and-per-thread-int4-quantization.html</loc>
<lastmod>2024-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2024/11/01/uncovering-real-gpu-noc-characteristics-implications-on-interconnect-architecture.html</loc>
<lastmod>2024-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2024/12/01/batchllm-optimizing-large-batched-llm-inference-with-global-prefix-sharing-and-throughput-oriented-token-batching.html</loc>
<lastmod>2024-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2024/12/01/deepseek-v3-technical-report.html</loc>
<lastmod>2024-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/compiler/2024/12/01/flex-attention-a-programming-model-for-generating-optimized-attention-kernels.html</loc>
<lastmod>2024-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/ssm/2024/12/01/gated-delta-networks-improving-mamba2-with-delta-rule.html</loc>
<lastmod>2024-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2024/12/01/mixllm-llm-quantization-with-global-mixed-precision-between-output-features-and-highly-efficient-system-design.html</loc>
<lastmod>2024-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/2024/12/01/unveiling-the-secret-recipe-a-guide-for-supervised-fine-tuning-small-llms.html</loc>
<lastmod>2024-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/01/01/accelerating-design-space-exploration-for-llm-training-systems-with-multi-experiment-parallel-simulation.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2025/01/01/autoccl-automated-collective-communication-tuning-for-accelerating-distributed-and-parallel-dnn-training.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2025/01/01/decdec-a-systems-approach-to-advancing-low-bit-llm-quantization.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2025/01/01/deft-decoding-with-flash-tree-attention-for-efficient-tree-structured-llm-inference.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2025/01/01/dissecting-and-modeling-the-architecture-of-modern-gpu-cores.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/01/01/enabling-efficient-gpu-communication-over-multiple-nics-with-fuselink.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/01/01/flexpipe-maximizing-training-efficiency-for-transformer-based-models-with-variable-length-inputs.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/01/01/jenga-enhancing-llm-long-context-fine-tuning-with-contextual-token-sparsity.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/01/01/kimi-k15-scaling-reinforcement-learning-with-llms.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/linear/2025/01/01/minimax-01-scaling-foundation-models-with-lightning-attention.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/nccl/2025/01/01/new-scaling-algorithm-and-initialization-with-nvidia-collective-communications-library-223.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2025/01/01/nvidia-blackwell.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2025/01/01/nvidia-dgx-b300.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2025/01/01/nvidia-rtx-blackwell-gpu-architecture.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/01/01/obscura-concealing-recomputation-overhead-in-training-of-large-language-models-with-bubble-filling-pipeline-transformation.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2025/01/01/pipethreader-software-defined-pipelining-for-efficient-dnn-execution.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2025/01/01/popfetcher-towards-accelerated-mixture-of-experts-training-via-popularity-based-expert-wise-prefetch.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/system/2025/01/01/principles-and-methodologies-for-serial-performance-optimization.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/01/01/qfactory-accelerating-quantized-large-language-model-serving-with-qtile-graphs.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/01/01/weaver-efficient-multi-llm-serving-with-attention-offloading.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/01/01/zen-empowering-distributed-training-with-sparsity-driven-data-synchronization.html</loc>
<lastmod>2025-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/02/01/autellix-an-efficient-serving-engine-for-llm-agents-as-general-programs.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/02/01/bytescale-efficient-scaling-of-llm-training-with-a-2048k-context-length-on-more-than-12000-gpus.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/02/01/dreamddp-accelerating-data-parallel-distributed-llm-training-with-layer-wise-scheduled-partial-synchronization.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/02/01/easyspec-layer-parallel-speculative-decoding-for-efficient-multi-gpu-utilization.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/sparsity/2025/02/01/flexprefill-a-context-aware-sparse-attention-mechanism-for-efficient-long-sequence-inference.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2025/02/01/fmoe-fine-grained-expert-offloading-for-large-mixture-of-experts-serving.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/02/01/kvlink-accelerating-large-language-models-via-efficient-kv-cache-reuse.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2025/02/01/lasp-2-rethinking-sequence-parallelism-for-linear-attention-and-its-hybrid.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/02/01/mario-near-zero-cost-activation-checkpointing-in-pipeline-parallelism.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/sparsity/2025/02/01/moba-mixture-of-block-attention-for-long-context-llms.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/sparsity/2025/02/01/native-sparse-attention-hardware-aligned-and-natively-trainable-sparse-attention.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/transformer-variant/2025/02/01/reasoning-with-latent-thoughts-on-the-power-of-looped-transformers.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/2025/02/01/scaling-up-muon-for-large-scale-language-model-training.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/02/01/training-llms-with-mxfp4.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2025/02/01/tree-attention-topology-aware-decoding-for-long-context-attention-on-gpu-clusters.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/02/01/twilight-adaptive-attention-sparsity-with-hierarchical-top-p-pruning.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/02/01/weipipe-weight-pipeline-parallelism-for-communication-effective-long-context-large-model-training.html</loc>
<lastmod>2025-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/03/01/communication-efficient-language-model-training-scales-reliably-and-robustly-scaling-laws-for-diloco.html</loc>
<lastmod>2025-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2025/03/01/dissecting-and-modeling-the-architecture-of-modern-gpu-cores.html</loc>
<lastmod>2025-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/03/01/eagle-3-scaling-up-inference-acceleration-of-large-language-models-via-training-time-test.html</loc>
<lastmod>2025-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/nccl/2025/03/01/networking-reliability-and-observability-at-scale-with-nccl-224.html</loc>
<lastmod>2025-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2025/03/01/neutrino-fine-grained-gpu-kernel-profiling-via-programmable-probing.html</loc>
<lastmod>2025-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/03/01/numerical-error-analysis-of-large-language-models.html</loc>
<lastmod>2025-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2025/03/01/oaken-fast-and-efficient-llm-serving-with-online-offline-hybrid-kv-cache-quantization.html</loc>
<lastmod>2025-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2025/03/01/r1-searcher-incentivizing-the-search-capability-in-llms-via-reinforcement-learning.html</loc>
<lastmod>2025-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/2025/03/01/samplemix-a-sample-wise-pre-training-data-mixing-strategey-by-coordinating-data-quality-and-diversity.html</loc>
<lastmod>2025-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2025/03/01/search-r1-training-llms-to-reason-and-leverage-search-engines-with-reinforcement-learning.html</loc>
<lastmod>2025-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/03/01/tapered-off-policy-reinforce-stable-and-efficient-reinforcement-learning-for-llms.html</loc>
<lastmod>2025-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2025/03/01/tiled-flash-linear-attention-more-efficient-linear-rnn-and-xlstm-kernels.html</loc>
<lastmod>2025-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2025/03/01/ub-mesh-a-hierarchically-localized-nd-fullmesh-datacenter-network-architecture.html</loc>
<lastmod>2025-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/03/01/understanding-stragglers-in-large-model-training-using-what-if-analysis.html</loc>
<lastmod>2025-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/03/01/wlb-llm-workload-balanced-4d-parallelism-for-large-language-model-training.html</loc>
<lastmod>2025-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/sparsity/2025/03/01/xattention-block-sparse-attention-with-antidiagonal-scoring.html</loc>
<lastmod>2025-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/2025/04/01/a-little-goes-a-long-way-efficient-long-context-training-and-inference-with-partial-contexts.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/pruning/2025/04/01/beware-of-calibration-data-for-pruning-large-language-models.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/04/01/block-verification-accelerates-speculative-decoding.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/04/01/bytecheckpoint-a-unified-checkpointing-system-for-large-foundation-model-development.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2025/04/01/cbq-cross-block-quantization-for-large-language-models.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2025/04/01/deepcoder-a-fully-open-source-14b-coder-at-o3-mini-level.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/04/01/distributed-speculative-inference-dsi-speculation-parallelism-for-provably-faster-lossless-language-model-inference.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2025/04/01/effective-interplay-between-sparsity-and-quantization-from-theory-to-practice.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/04/01/exploring-data-scaling-trends-and-effects-in-reinforcement-learning-from-human-feedback.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/04/01/faster-cascades-via-speculative-decoding.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/04/01/fiddler-cpu-gpu-orchestration-for-fast-inference-of-mixture-of-experts-models.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2025/04/01/flashmask-efficient-and-rich-mask-extension-of-flashattention.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/04/01/helios-adaptive-model-and-early-exit-selection-for-efficient-llm-inference-serving.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/2025/04/01/how-does-critical-batch-size-scale-in-pretraining.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/transformer-variant/2025/04/01/hyper-connections.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2025/04/01/introducing-ualink-200g-10-specification.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/04/01/long-context-compression-with-activation-beacon.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2025/04/01/mem0-building-production-ready-ai-agents-with-scalable-long-term-memory.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/04/01/mixture-of-attentions-for-speculative-decoding.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/04/01/multi-draft-speculative-sampling-canonical-decomposition-and-theoretical-limits.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/04/01/nemotron-h-a-family-of-accurate-and-efficient-hybrid-mamba-transformer-models.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2025/04/01/netmoe-accelerating-moe-training-through-dynamic-sample-placement.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/04/01/not-all-heads-matter-a-head-level-kv-cache-compression-method-with-integrated-retrieval-and-reasoning.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/04/01/omnikv-dynamic-context-selection-for-efficient-long-context-llms.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2025/04/01/ozaki-scheme-ii-a-gemm-oriented-emulation-of-floating-point-matrix-multiplication-using-an-integer-modular-technique.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/pruning/2025/04/01/probe-pruning-accelerating-llms-through-dynamic-pruning-via-model-probing.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2025/04/01/progressive-mixed-precision-decoding-for-efficient-llm-inference.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/04/01/ragen-understanding-self-evolution-in-llm-agents-via-multi-turn-reinforcement-learning.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/04/01/razorattention-efficient-kv-cache-compression-through-retrieval-heads.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2025/04/01/retool-reinforcement-learning-for-strategic-tool-use-in-llms.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/04/01/scaling-fp8-training-to-trillion-token-llms.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2025/04/01/scaling-laws-for-precision.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/04/01/scbench-a-kv-cache-centric-analysis-of-long-context-methods.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/04/01/seed15-thinking-advancing-superb-reasoning-models-with-reinforcement-learning.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/04/01/simai-unifying-architecture-design-and-performance-tuning-for-large-scale-large-language-model-training-with-scalability-and-precision.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/04/01/squeezeattention-2d-management-of-kvcache-in-llm-inference-via-layer-wise-optimal-budget.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2025/04/01/streamrl-scalable-heterogeneous-and-elastic-rl-for-llms-with-disaggregated-stream-generation.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/04/01/swift-on-the-fly-self-speculative-decoding-for-llm-inference-acceleration.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/compiler/2025/04/01/thunderkittens-simple-fast-and-adorable-kernels.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/compiler/2025/04/01/tilelang-a-composable-tiled-programming-model-for-ai-systems.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/compiler/2025/04/01/tilelink-generating-efficient-compute-communication-overlapping-kernels-using-tile-centric-primitives.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/04/01/tilus-a-virtual-machine-for-arbitrary-low-precision-gpgpu-computation-in-llm-serving.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/04/01/torchtitan-one-stop-pytorch-native-solution-for-production-ready-llm-pretraining.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/04/01/towards-optimal-multi-draft-speculative-decoding.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2025/04/01/turboquant-online-vector-quantization-with-near-optimal-distortion-rate.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/04/01/vl-cache-sparsity-and-modality-aware-kv-cache-compression-for-vision-language-model-inference-acceleration.html</loc>
<lastmod>2025-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2025/05/01/a-survey-on-test-time-scaling-in-large-language-models-what-how-where-and-how-well.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/diffusions/dllm/2025/05/01/accelerating-diffusion-llms-via-adaptive-parallel-decoding.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2025/05/01/areal-a-large-scale-asynchronous-reinforcement-learning-system-for-language-reasoning.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/05/01/dapo-an-open-source-llm-reinforcement-learning-system-at-scale.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/05/01/ecco-improving-memory-bandwidth-and-capacity-for-llms-via-entropy-aware-cache-compression.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2025/05/01/flashmla-etap-efficient-transpose-attention-pipeline-for-accelerating-mla-inference-on-nvidia-h20-gpus.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/compiler/2025/05/01/flashtensor-optimizing-tensor-programs-by-leveraging-fine-grained-tensor-property.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/05/01/gllm-global-balanced-pipeline-parallelism-system-for-distributed-llm-serving-with-token-throttling.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/05/01/insights-into-deepseek-v3-scaling-challenges-and-reflections-on-hardware-for-ai-architectures.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/05/01/intellect-2-a-reasoning-model-trained-through-globally-decentralized-reinforcement-learning.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2025/05/01/kperfir-towards-an-open-and-compiler-centric-ecosystem-for-gpu-kernel-performance-tooling-on-modern-ai-workloads.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2025/05/01/llamarl-a-distributed-asynchronous-reinforcement-learning-framework-for-efficient-large-scale-llm-training.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/05/01/mimo-unlocking-the-reasoning-potential-of-language-model-from-pretraining-to-posttraining.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/05/01/moesd-unveil-speculative-decodings-potential-for-accelerating-sparse-moe.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/05/01/prism-unleashing-gpu-sharing-for-cost-efficient-multi-llm-serving.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/05/01/prorl-prolonged-reinforcement-learning-expands-reasoning-boundaries-in-large-language-models.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/05/01/qwen3-technical-report.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2025/05/01/recipes-for-pre-training-llms-with-mxfp8.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2025/05/01/sageattention3-microscaling-fp4-attention-for-inference-and-an-exploration-of-8-bit-training.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/diffusions/dllm/2025/05/01/sparse-videogen2-accelerate-video-generation-with-sparse-attention-via-semantic-aware-permutation.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/05/01/the-entropy-mechanism-of-reinforcement-learning-for-reasoning-language-models.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/05/01/tokenweave-efficient-compute-communication-overlap-for-distributed-llm-inference.html</loc>
<lastmod>2025-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/06/01/compress-gather-and-recompute-reforming-long-context-processing-in-transformers.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2025/06/01/contextcache-context-aware-semantic-cache-for-multi-turn-queries-in-large-language-models.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/06/01/cost-efficient-llm-training-with-lifetime-aware-tensor-offloading-via-gpudirect-storage.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/system/2025/06/01/decomposing-craft-an-elementary-grammar-for-sharing-expertise-in-craft-workflows.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/06/01/dotsllm1-technical-report.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2025/06/01/flashdmoe-fast-distributed-moe-in-a-single-kernel.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/2025/06/01/gated-attention-for-large-language-models-non-linearity-sparsity-and-attention-sink-free.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/nccl/2025/06/01/improved-performance-and-monitoring-capabilities-with-nvidia-collective-communications-library-226.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/06/01/kvcache-cache-in-the-wild-characterizing-and-optimizing-kvcache-cache-at-a-large-cloud-provider.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/system/2025/06/01/leann-a-low-storage-overhead-vector-index.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/06/01/lia-a-single-gpu-llm-inference-acceleration-with-cooperative-amx-enabled-cpu-gpu-computation-and-cxl-offloading.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/06/01/meshslice-efficient-2d-tensor-parallelism-for-distributed-dnn-training.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/compiler/2025/06/01/mirage-a-multi-level-superoptimizer-for-tensor-programs.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/2025/06/01/multipole-attention-for-efficient-long-context-reasoning.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/06/01/reinforcement-learning-optimization-for-large-scale-learning-an-efficient-and-user-friendly-scaling-library.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/06/01/scaling-llama-3-training-with-efficient-parallelism-strategies.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/06/01/scaling-speculative-decoding-with-lookahead-reasoning.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2025/06/01/serving-large-language-models-on-huawei-cloudmatrix384.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/2025/06/01/spark-transformer-reactivating-sparsity-in-ffn-and-attention.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/06/01/streambp-memory-efficient-exact-backpropagation-for-long-sequence-training-of-llms.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/06/01/td-pipe-temporally-disaggregated-pipeline-parallelism-architecture-for-high-throughput-llm-inference.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/06/01/understanding-and-mitigating-numerical-sources-of-nondeterminism-in-llm-inference.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/06/01/yggdrasil-bridging-dynamic-speculation-and-static-runtime-for-latency-optimal-tree-based-llm-decoding.html</loc>
<lastmod>2025-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2025/07/01/asyncflow-an-asynchronous-streaming-rl-framework-for-efficient-llm-post-training.html</loc>
<lastmod>2025-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2025/07/01/demystifying-nccl-an-in-depth-analysis-of-gpu-communication-protocols-and-algorithms.html</loc>
<lastmod>2025-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2025/07/01/dissecting-the-nvidia-blackwell-architecture-with-microbenchmarks.html</loc>
<lastmod>2025-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2025/07/01/distflow-a-fully-distributed-rl-framework-for-scalable-and-efficient-llm-post-training.html</loc>
<lastmod>2025-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/graph/2025/07/01/elk-exploring-the-efficiency-of-inter-core-connected-ai-chips-with-deep-learning-compiler-techniques.html</loc>
<lastmod>2025-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/nccl/2025/07/01/enabling-fast-inference-and-resilient-training-with-nccl-227.html</loc>
<lastmod>2025-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/diffusions/dllm/2025/07/01/fast-dllm-training-free-acceleration-of-diffusion-llm-by-enabling-kv-cache-and-parallel-decoding.html</loc>
<lastmod>2025-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/07/01/group-sequence-policy-optimization.html</loc>
<lastmod>2025-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/07/01/helix-parallelism-rethinking-sharding-strategies-for-interactive-multi-million-token-llm-decoding.html</loc>
<lastmod>2025-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/07/01/kvflow-efficient-prefix-caching-for-accelerating-llm-based-multi-agent-workflows.html</loc>
<lastmod>2025-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/07/01/megascale-infer-serving-mixture-of-experts-at-scale-with-disaggregated-expert-parallelism.html</loc>
<lastmod>2025-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2025/07/01/memagent-reshaping-long-context-llm-with-multi-conv-rl-based-memory-agent.html</loc>
<lastmod>2025-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/ssm/2025/07/01/overcoming-long-context-limitations-of-state-space-models-via-context-dependent-sparse-attention.html</loc>
<lastmod>2025-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2025/07/01/scale-up-ethernet-framework-scale-up-ethernet-framework-specification.html</loc>
<lastmod>2025-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/07/01/step-3-is-large-yet-affordable-model-system-co-design-for-cost-effective-decoding.html</loc>
<lastmod>2025-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/07/01/zeco-zero-communication-overhead-sequence-parallelism-for-linear-attention.html</loc>
<lastmod>2025-07-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/08/01/agent-lightning-train-any-ai-agents-with-reinforcement-learning.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2025/08/01/an-extensible-software-transport-layer-for-gpu-networking.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/linear/2025/08/01/artificial-hippocampus-networks-for-efficient-long-context-modeling.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/compiler/2025/08/01/clusterfusion-expanding-operator-fusion-scope-for-llm-inference-via-cluster-level-collective-primitive.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/08/01/fp4-all-the-way-fully-quantized-training-of-llms.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/08/01/glm-45-agentic-reasoning-and-coding-arc-foundation-models.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/transformer-variant/2025/08/01/hierarchical-reasoning-model.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/08/01/kimi-k2-open-agentic-intelligence.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/08/01/kling-omni-technical-report.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/08/01/longcat-flash-technical-report.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/eval/2025/08/01/mcp-bench-benchmarking-tool-using-llm-agents-with-complex-real-world-tasks-via-mcp-servers.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2025/08/01/mixture-of-contexts-for-long-video-generation.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/08/01/nvidia-nemotron-nano-2-an-accurate-and-efficient-hybrid-mamba-transformer-reasoning-model.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/08/01/on-policy-rl-meets-off-policy-experts-harmonizing-supervised-fine-tuning-and-reinforcement-learning-via-dynamic-weighting.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/08/01/optimus-accelerating-large-scale-multi-modal-llm-training-by-bubble-exploitation.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2025/08/01/rstar2-agent-agentic-reasoning-technical-report.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2025/08/01/seamlessflow-a-traineragent-isolation-rl-framework-achieving-bubble-free-pipelines-via-tag-scheduling.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/08/01/towards-efficient-and-practical-gpu-multitasking-in-the-era-of-llm.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/08/01/tricks-or-traps-a-deep-dive-into-rl-for-llm-reasoning.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/08/01/veomni-scaling-any-modality-model-training-with-model-centric-distributed-recipe-zoo.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2025/08/01/your-efficient-rl-framework-secretly-brings-you-offpolicy-rl-training.html</loc>
<lastmod>2025-08-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/09/01/accurate-kv-cache-eviction-via-anchor-direction-projection-for-efficient-llm-inference.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2025/09/01/categorical-foundations-for-cute-layouts.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/09/01/defeating-nondeterminism-in-llm-inference.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2025/09/01/diep-adaptive-mixture-of-experts-compression-through-differentiable-expert-pruning.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2025/09/01/effective-context-engineering-for-ai-agents.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/09/01/efficient-pre-training-of-llms-via-topology-aware-communication-alignment-on-more-than-9600-gpus.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2025/09/01/expert-as-a-service-towards-efficient-scalable-and-robust-large-scale-moe-serving.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/2025/09/01/fast-attention-mechanisms-a-tale-of-parallelism.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/diffusions/dllm/2025/09/01/fast-dllm-v2-efficient-block-diffusion-llm.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2025/09/01/flowmoe-a-scalable-pipeline-scheduling-framework-for-distributed-mixture-of-experts-training.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/09/01/learned-prefix-caching-for-efficient-llm-inference.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/09/01/let-the-llm-stick-to-its-strengths-learning-to-route-economical-llm.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2025/09/01/liquidgemm-hardware-efficient-w4a8-gemm-kernel-for-high-performance-llm-serving.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/09/01/longcat-flash-thinking-technical-report.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/09/01/mimo-audio-audio-language-models-are-few-shot-learners.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2025/09/01/pipelinerl-faster-on-policy-reinforcement-learning-for-long-sequence-generation.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2025/09/01/q-palette-fractional-bit-quantizers-toward-optimal-bit-allocation-for-efficient-llm-deployment.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/09/01/robust-llm-training-infrastructure-at-bytedance.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/09/01/rollpacker-mitigating-long-tail-rollouts-for-fast-synchronous-rl-post-training.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/09/01/scaling-llm-test-time-compute-with-mobile-npu-on-smartphones.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2025/09/01/seerattention-self-distilled-attention-gating-for-efficient-long-context-prefilling.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2025/09/01/sla-beyond-sparsity-in-diffusion-transformers-via-fine-tunable-sparselinear-attention.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/compiler/tensor/2025/09/01/streamtensor-make-tensors-stream-in-dataflow-accelerators-for-llms.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2025/09/01/the-landscape-of-agentic-reinforcement-learning-for-llms-a-survey.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2025/09/01/transcending-cost-quality-tradeoff-in-agent-serving-via-session-awareness.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2025/09/01/ui-tars-2-technical-report-advancing-gui-agent-with-multi-turn-reinforcement-learning.html</loc>
<lastmod>2025-09-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/10/01/asymmetric-proximal-policy-optimization-mini-critics-boost-llm-reasoning.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2025/10/01/axcore-a-quantization-aware-approximate-gemm-unit-for-llm-inference.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2025/10/01/cage-curvature-aware-gradient-estimation-for-accurate-quantization-aware-training.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/10/01/chunkkv-semantic-preserving-kv-cache-compression-for-efficient-long-context-llm-inference.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2025/10/01/coruscant-co-designing-gpu-kernel-and-sparse-tensor-core-to-advocate-unstructured-sparsity-in-efficient-llm-inference.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/10/01/deepseek-ocr-contexts-optical-compression.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/cpu/2025/10/01/dram-fault-classification-through-large-scale-field-monitoring-for-robust-memory-ras-management.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/10/01/efficient-long-context-language-model-training-by-core-attention-disaggregation.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/10/01/from-tokens-to-layers-redefining-stall-free-scheduling-for-llm-serving-with-layered-prefill.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/10/01/griffin-effective-token-alignment-for-faster-speculative-decoding.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/10/01/hierarchical-balance-packing-towards-efficient-supervised-fine-tuning-for-long-context-llm.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2025/10/01/hybridep-scaling-expert-parallelism-to-cross-datacenter-scenario-via-hybrid-expertdata-transmission.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/10/01/kelle-co-design-kv-caching-and-edram-for-efficient-llm-serving-in-edge-computing.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/10/01/kimi-linear-an-expressive-efficient-attention-architecture.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/10/01/kvcomm-online-cross-context-kv-cache-communication-for-efficient-llm-based-multi-agent-systems.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2025/10/01/leveraging-chiplet-locality-for-efficient-memory-mapping-in-multi-chip-module-gpus.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/10/01/longcat-flash-omni-technical-report.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/10/01/mixtures-of-subspaces-for-bandwidth-efficient-context-parallel-training.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/10/01/mtraining-distributed-dynamic-sparse-attention-for-efficient-ultra-long-context-training.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/10/01/mtraining-efficient-distributed-training-for-ultra-long-contexts-via-dynamic-sparse-attention.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2025/10/01/mx-pushing-the-limits-of-microscaling-formats-for-efficient-large-language-model-serving.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/nccl/2025/10/01/netzip-algorithmhardware-co-design-of-in-network-lossless-compression-for-distributed-large-model-training.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/nccl/2025/10/01/optimizing-all-to-all-collective-communication-with-fault-tolerance-on-torus-networks.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/transformer-variant/2025/10/01/parallel-loop-transformer-for-efficient-test-time-computation-scaling.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/10/01/part-ii-roll-flash-accelerating-rlvr-and-agentic-training-with-asynchrony.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2025/10/01/previewing-uccl-ep-flexible-and-efficient-expert-parallelism-for-cloud-and-beyond.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2025/10/01/rdma-point-to-point-communication-for-llm-systems.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/nccl/2025/10/01/skipreduce-interconnection-network-sparsity-to-accelerate-distributed-machine-learning.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/10/01/speculate-deep-and-accurate-lossless-and-training-free-acceleration-for-offloaded-llms-via-substitute-speculative-decoding.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2025/10/01/stabilizing-moe-reinforcement-learning-by-aligning-training-and-inference-routers.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2025/10/01/stratum-system-hardware-co-design-with-tiered-monolithic-3d-stackable-dram-for-efficient-moe-serving.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/nccl/2025/10/01/supermesh-energy-efficient-collective-communications-for-accelerators.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/10/01/synergistic-tensor-and-pipeline-parallelism.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/10/01/tail-optimized-caching-for-llm-inference.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/10/01/tasp-topology-aware-sequence-parallelism.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/compiler/2025/10/01/tawa-automatic-warp-specialization-for-modern-gpus-with-asynchronous-references.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/10/01/towards-fully-fp8-gemm-llm-training-at-scale.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/10/01/when-to-reason-semantic-router-for-vllm.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/2025/10/01/why-low-precision-transformer-training-fails-an-analysis-on-flash-attention.html</loc>
<lastmod>2025-10-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/11/01/beat-the-long-tail-distribution-aware-speculative-decoding-for-rl-training.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/11/01/contextpilot-fast-long-context-inference-via-context-reuse.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/11/01/continuum-efficient-and-robust-multi-turn-llm-agent-scheduling-with-kv-cache-time-to-live.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/11/01/deepseek-v32-exp-boosting-long-context-efficiency-with-deepseek-sparse-attention.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2025/11/01/deterministic-inference-across-tensor-parallel-sizes-that-eliminates-traininginference-mismatch.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/2025/11/01/evolm-in-search-of-lost-language-model-training-dynamics.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2025/11/01/farskip-collective-unhobbling-blocking-communication-in-mixture-of-experts-models.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2025/11/01/flashmoe-fast-distributed-moe-in-a-single-kernel.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/11/01/flexicache-leveraging-temporal-stability-of-attention-heads-for-efficient-kv-cache-management.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2025/11/01/fp8-flow-moe-a-casting-free-fp8-recipe-without-double-quantization-error.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/nccl/2025/11/01/fusing-communication-and-compute-with-new-device-api-and-copy-engine-collectives-in-nvidia-nccl-228.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/11/01/gemini-3-pro-model-card.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2025/11/01/gpu-initiated-networking-for-nccl.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/11/01/hunyuanocr-technical-report.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2025/11/01/intattention-a-fully-integer-attention-pipeline-for-efficient-edge-inference.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/11/01/kitty-accurate-and-efficient-2-bit-kv-cache-quantization-with-dynamic-channel-wise-precision-boost.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/2025/11/01/nested-learning-the-illusion-of-deep-learning-architectures.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2025/11/01/opportunistic-expert-activation-batch-aware-expert-routing-for-faster-decode-without-retraining.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/11/01/prime-rl-async-decentralized-rl-training-at-scale.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/11/01/quartet-native-fp4-training-can-be-optimal-for-large-language-models.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/transformer-variant/2025/11/01/scaling-latent-reasoning-via-looped-language-models.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2025/11/01/seer-online-context-learning-for-fast-synchronous-llm-reinforcement-learning.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/11/01/skyrl-agent-efficient-rl-training-for-multi-turn-llm-agent.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/11/01/specdiff-2-scaling-diffusion-drafter-alignment-for-faster-speculative-decoding.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/11/01/suffixdecoding-extreme-speculative-decoding-for-emerging-ai-applications.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/11/01/system-card-claude-opus-45.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2025/11/01/tensor-parallelism-with-partially-synchronized-activations.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2025/11/01/tree-training-accelerating-agentic-llms-training-via-shared-prefix-reuse.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/2025/11/01/virtual-width-networks.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/2025/11/01/weight-sparse-transformers-have-interpretable-circuits.html</loc>
<lastmod>2025-11-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/12/01/accelerating-large-scale-reasoning-model-inference-self-speculative-decoding-with-sparse-attention.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2025/12/01/beluga-a-cxl-based-memory-architecture-for-scalable-and-efficient-llm-kvcache-management.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/sparsity/2025/12/01/blasst-dynamic-blocked-attention-sparsity-via-softmax-thresholding.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/low_precision/2025/12/01/codegemm-a-codebook-centric-approach-to-efficient-gemm-in-quantized-llms.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/12/01/deepseek-v32-pushing-the-frontier-of-open-large-language-models.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/12/01/dynapipe-dynamic-layer-redistribution-for-efficient-serving-of-llms-with-pipeline-parallelism.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/12/01/efficient-low-rank-attention-for-long-context-inference-in-large-language-models.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/12/01/let-it-flow-agentic-crafting-on-rock-and-roll-building-the-rome-model-within-an-open-agentic-learning-ecosystem.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2025/12/01/mesh-attention-a-new-communication-efficient-distributed-attention-with-improved-data-locality.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/2025/12/01/mhc-manifold-constrained-hyper-connections.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/12/01/mimo-v2-flash-technical-report.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/compiler/2025/12/01/mirage-persistent-kernel-a-compiler-and-runtime-for-mega-kernelizing-tensor-programs.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2025/12/01/mma-sim-bit-accurate-reference-model-of-tensor-cores-and-matrix-cores.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2025/12/01/native-parallel-reasoner-reasoning-in-parallelism-via-self-distilled-reinforcement-learning.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/12/01/nemotron-3-nano-open-efficient-mixture-of-experts-hybrid-mamba-transformer-model-for-agentic-reasoning.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/12/01/nvidia-nemotron-3-efficient-and-open-intelligence.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2025/12/01/on-the-interplay-of-pre-training-mid-training-and-rl-on-reasoning-language-models.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/12/01/qwenlong-l15-post-training-recipe-for-long-context-reasoning-and-memory-management.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/diffusions/dllm/2025/12/01/radial-attention-on-log-n-sparse-attention-with-energy-decay-for-long-video-generation.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2025/12/01/rlax-large-scale-distributed-reinforcement-learning-for-large-language-models-on-tpus.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/12/01/seed18-model-card-towards-generalized-real-world-agency.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2025/12/01/skipkv-selective-skipping-of-kv-generation-and-storage-for-efficient-inference-with-large-reasoning-models.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2025/12/01/skrull-towards-efficient-long-context-fine-tuning-through-dynamic-data-scheduling.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/pretrain_sft/2025/12/01/skyladder-better-and-faster-pretraining-via-context-window-scheduling.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2025/12/01/sonicmoe-accelerating-moe-with-io-and-tile-aware-optimizations.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2025/12/01/state-of-ai-an-empirical-100-trillion-token-study-with-openrouter.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/2025/12/01/tensor-product-attention-is-all-you-need.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/eval/2025/12/01/the-llm-evaluation-guidebook.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2025/12/01/towards-a-science-of-scaling-agent-systems.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/transformer-variant/2025/12/01/universal-reasoning-model.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2025/12/01/update-to-gpt-5-system-card-gpt-52.html</loc>
<lastmod>2025-12-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2026/01/01/areal-dta-dynamic-tree-attention-for-efficient-reinforcement-learning-of-large-language-models.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/transformer-variant/2026/01/01/conditional-memory-via-scalable-lookup-a-new-axis-of-sparsity-for-large-language-models.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2026/01/01/dflash-block-diffusion-for-flash-speculative-decoding.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/transformer-variant/2026/01/01/fast-weight-product-key-memory.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2026/01/01/flashattention-t-towards-fully-tensorized-attention-by-exploiting-tensor-vector-parallelism.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2026/01/01/flashinfer-bench-building-the-virtuous-cycle-for-ai-driven-llm-systems.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2026/01/01/iquest-coder-v1-technical-report.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2026/01/01/jet-rl-enabling-on-policy-fp8-reinforcement-learning-with-unified-training-and-rollout-precision-flow.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2026/01/01/laps-a-length-aware-prefill-llm-serving-system.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2026/01/01/least-loaded-expert-parallelism-load-balancing-an-imbalanced-mixture-of-experts.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2026/01/01/llm-42-enabling-determinism-in-llm-inference-with-verified-speculation.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/transformer-variant/2026/01/01/mhc-lite-you-dont-need-20-sinkhorn-knopp-iterations.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/transformer-variant/2026/01/01/mhla-restoring-expressivity-of-linear-attention-via-token-level-multi-head.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2026/01/01/moeblaze-breaking-the-memory-wall-for-efficient-moe-training-on-modern-gpus.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2026/01/01/reinforcement-learning-via-self-distillation.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2026/01/01/scaling-embeddings-outperforms-scaling-experts-in-language-models.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/framework/2026/01/01/vibetensor-system-software-for-deep-learning-fully-generated-by-ai-agents.html</loc>
<lastmod>2026-01-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2026/02/01/boute-cost-efficient-llm-serving-with-heterogeneous-llms-and-gpus-via-multi-objective-bayesian-optimization.html</loc>
<lastmod>2026-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/gpu/2026/02/01/cuda-agent-large-scale-agentic-rl-for-high-performance-cuda-kernel-generation.html</loc>
<lastmod>2026-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2026/02/01/dflash-block-diffusion-for-flash-speculative-decoding.html</loc>
<lastmod>2026-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2026/02/01/dualpath-breaking-the-storage-bandwidth-bottleneck-in-agentic-llm-inference.html</loc>
<lastmod>2026-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2026/02/01/echo-2-a-large-scale-distributed-rollout-framework-for-cost-efficient-reinforcement-learning.html</loc>
<lastmod>2026-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2026/02/01/forge-scalable-agent-rl-framework-and-algorithm.html</loc>
<lastmod>2026-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2026/02/01/glm-5-from-vibe-coding-to-agentic-engineering.html</loc>
<lastmod>2026-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2026/02/01/kimi-k25-visual-agentic-intelligence.html</loc>
<lastmod>2026-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2026/02/01/moe-spec-expert-budgeting-for-efficient-speculative-decoding.html</loc>
<lastmod>2026-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2026/02/01/p-eagle-parallel-drafting-eagle-with-scalable-training.html</loc>
<lastmod>2026-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2026/02/01/rlhfless-serverless-computing-for-efficient-rlhf.html</loc>
<lastmod>2026-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2026/02/01/seed20-model-card-towards-intelligence-frontier-for-real-world-complexity.html</loc>
<lastmod>2026-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2026/02/01/system-card-claude-opus-46.html</loc>
<lastmod>2026-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2026/02/01/thunderagent-a-simple-fast-and-program-aware-agentic-inference-system.html</loc>
<lastmod>2026-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2026/02/01/understanding-and-exploiting-weight-update-sparsity-for-communication-efficient-distributed-rl.html</loc>
<lastmod>2026-02-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2026/03/01/arl-tangram-unleash-the-resource-efficiency-in-agentic-reinforcement-learning.html</loc>
<lastmod>2026-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2026/03/01/avo-agentic-variation-operators-for-autonomous-evolutionary-search.html</loc>
<lastmod>2026-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2026/03/01/do-phone-use-agents-respect-your-privacy.html</loc>
<lastmod>2026-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/attention/2026/03/01/flashattention-4-algorithm-and-kernel-pipelining-co-design-for-asymmetric-hardware-scaling.html</loc>
<lastmod>2026-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2026/03/01/flashprefill-instantaneous-pattern-discovery-and-thresholding-for-ultra-fast-long-context-prefilling.html</loc>
<lastmod>2026-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2026/03/01/heddle-a-distributed-orchestration-system-for-agentic-rl-rollout.html</loc>
<lastmod>2026-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/ssm/2026/03/01/mamba-3-improved-sequence-modeling-using-state-space-principles.html</loc>
<lastmod>2026-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/sparsity/2026/03/01/mixture-of-depths-attention.html</loc>
<lastmod>2026-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/nccl/2026/03/01/nccl-ep-towards-a-unified-expert-parallel-communication-api-for-nccl.html</loc>
<lastmod>2026-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2026/03/01/nest-network-and-memory-aware-device-placement-for-distributed-deep-learning.html</loc>
<lastmod>2026-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2026/03/01/prorl-agent-rollout-as-a-service-for-rl-training-of-multi-turn-llm-agents.html</loc>
<lastmod>2026-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2026/03/01/scalable-training-of-mixture-of-experts-models-with-megatron-core.html</loc>
<lastmod>2026-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/speculative_decoding/2026/03/01/speculative-speculative-decoding.html</loc>
<lastmod>2026-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/architecture/attention/2026/03/01/technical-report-of-attention-residuals.html</loc>
<lastmod>2026-03-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/rl/2026/03/12/rlax-large-scale-distributed-reinforcement-learning-for-large-language-models-on-tpus.html</loc>
<lastmod>2026-03-12T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/agent/2026/03/25/composer-2-technical-report.html</loc>
<lastmod>2026-03-25T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/system/2026/03/25/modern-code-review-a-case-study-at-google.html</loc>
<lastmod>2026-03-25T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2026/04/01/blink-cpu-free-llm-inference-by-delegating-the-serving-stack-to-gpu-and-smartnic.html</loc>
<lastmod>2026-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2026/04/01/deepseek-v4-towards-highly-efficient-million-token-context-intelligence.html</loc>
<lastmod>2026-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/2026/04/01/dwdp-distributed-weight-data-parallelism-for-high-performance-llm-inference-on-nvl72.html</loc>
<lastmod>2026-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2026/04/01/prefill-as-a-service-kvcache-of-next-generation-models-could-go-cross-datacenter.html</loc>
<lastmod>2026-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2026/04/01/routing-free-mixture-of-experts.html</loc>
<lastmod>2026-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2026/04/01/scalable-pretraining-of-large-mixture-of-experts-language-models-on-aurora-super-computer.html</loc>
<lastmod>2026-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2026/04/01/system-card-claude-mythos-preview.html</loc>
<lastmod>2026-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2026/04/01/the-illusion-of-equivalence-systematic-fp16-divergence-in-kv-cached-autoregressive-inference.html</loc>
<lastmod>2026-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2026/04/01/tokendance-scaling-multi-agent-llm-serving-via-collective-kv-cache-sharing.html</loc>
<lastmod>2026-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2026/04/01/triattention-efficient-long-reasoning-with-trigonometric-kv-compression.html</loc>
<lastmod>2026-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2026/04/01/when-rl-meets-adaptive-speculative-training-a-unified-trainingserving-system.html</loc>
<lastmod>2026-04-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2026/05/01/disagmoe-computation-communication-overlapped-moe-training-via-disaggregated-af-pipe-parallelism.html</loc>
<lastmod>2026-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/mlsys/networking/2026/05/01/eliminating-hidden-serialization-in-multi-node-megakernel-communication.html</loc>
<lastmod>2026-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2026/05/01/megascale-omni-a-hyper-scale-workload-resilient-system-for-multimodal-llm-training-in-production.html</loc>
<lastmod>2026-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2026/05/01/pithtrain-a-compact-and-agent-native-moe-training-system.html</loc>
<lastmod>2026-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/rl/2026/05/01/polar-agentic-rl-on-any-harness-at-scale.html</loc>
<lastmod>2026-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/train/2026/05/01/pretraining-large-language-models-with-mxfp4-on-native-fp4-hardware.html</loc>
<lastmod>2026-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2026/05/01/the-minimax-m2-series-mini-activations-unleashing-max-real-world-intelligence.html</loc>
<lastmod>2026-05-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2026/06/01/a-visual-guide-to-gemma-4-12b.html</loc>
<lastmod>2026-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2026/06/01/gemma-4-12b-the-developer-guide.html</loc>
<lastmod>2026-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2026/06/01/introducing-gemma-4-12b-a-unified-encoder-free-multimodal-model.html</loc>
<lastmod>2026-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2026/06/01/mai-thinking-1-building-a-hill-climbing-machine.html</loc>
<lastmod>2026-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/inference/kvcache/2026/06/01/momentkv-closing-the-directional-gap-in-kv-cache-eviction-for-long-context-inference.html</loc>
<lastmod>2026-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/algorithm/models/2026/06/01/nemotron-3-ultra-open-efficient-mixture-of-experts-hybrid-mamba-transformer-model-for-agentic-reasoning.html</loc>
<lastmod>2026-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/papers/llm/engineering/moe/2026/06/01/ultraep-unleash-moe-training-and-inference-on-rack-scale-nodes-with-near-optimal-load-balancing.html</loc>
<lastmod>2026-06-01T00:00:00+00:00</lastmod>
</url>
<url>
<loc>https://www.papercache.org/</loc>
</url>
<url>
<loc>https://www.papercache.org/collection.html</loc>
</url>
<url>
<loc>https://www.papercache.org/about/</loc>
</url>
<url>
<loc>https://www.papercache.org/account/favorites.html</loc>
</url>
<url>
<loc>https://www.papercache.org/feeds/</loc>
</url>
<url>
<loc>https://www.papercache.org/admin/</loc>
</url>
<url>
<loc>https://www.papercache.org/account/profile.html</loc>
</url>
<url>
<loc>https://www.papercache.org/auth/reset-password.html</loc>
</url>
<url>
<loc>https://www.papercache.org/auth/verify.html</loc>
</url>
</urlset>
