queue.json · 6680 lines (6680 loc) · 403 KB
{
"bridge": [
{
"arxiv_id": "2601.03067",
"title": "Joint Encoding of KV-Cache Blocks for Scalable LLM Serving",
"abstract": "Modern large language models (LLMs) drive interactive AI systems but are bottlenecked by the memory-heavy growth of key-value (KV) caches, which limits real-time throughput under concurrent loads. Existing KV-cache compression methods rely on rigid heuristics, disrupt tensor layouts, or require specialized compute, hindering scalability and deployment. We propose joint encoding of KV-cache blocks, which fuses similar blocks across requests and input chunks into shared representations while preserving standard cache structure. This alleviates the KV-cache memory bottleneck, supporting high-concurrency serving without specialized hardware. Theoretically, we analyze the rate-distortion tradeoff of fused cache blocks under a Poisson process model. Empirically, our method achieves up to 4.38 $\\times$ KV-cache compression with negligible accuracy loss across diverse LLMs and benchmarks, outperforming recent structured and adaptive compression baselines. In real LLM serving, joint encoding improves the token throughput by $\\sim$40\\% on a single-machine vLLM benchmark, demonstrating substantial gains in inference throughput. Code is available at https://github.com/sef1/kv_fast_fusion kv_joint_encoding.",
"authors": [
"Joseph Kampeas",
"Emir Haleva"
],
"published_at": "2026-01-06T14:50:58",
"categories": [
"cs.LG",
"cs.AI"
],
"url": "http://arxiv.org/abs/2601.03067v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 1,
"influential_citation_count": 0,
"scope_bucket": "foundation",
"domain_bucket": "llm",
"paper_type": "benchmark",
"narrow_domain_flag": false,
"sim_public": 0.5128501057624817,
"sim_memory": 0.6588089466094971,
"sim_negative": 0.16072729229927063,
"broad_relevance": 0.5128501057624817,
"momentum": 0.017490950964499406,
"teachability": 0.85,
"novelty_score": 0.37791067361831665,
"evidence_score": 0.5793147180559945,
"direct_memory_relevance": 0.6588089466094971,
"systems_leverage": 0.4,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.0,
"transferability_score": 0.77,
"clarity": 0.7999999999999999,
"reproducibility": 0.5,
"public_interest_score": 0.5744370306727911,
"memory_score": 0.6459604501495684,
"quality_score": 0.5727258872223978,
"bridge_score": 0.5744370306727911,
"max_axis_score": 0.6459604501495684,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "180d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Memory/Storage Core",
"Bridge",
"Systems",
"Theory",
"Inference"
],
"status": "Cover now",
"why_now": "KV-cache growth is one of the most immediate bottlenecks in high-concurrency LLM serving, and this paper proposes a deployable compression approach that preserves standard cache structure while showing throughput gains in vLLM. It squarely hits current production concerns around memory capacity, serving efficiency, and system scalability without requiring exotic hardware.",
"why_not_higher": "The topic is more infrastructure-focused than broadly model-architectural, so it may not land with every general AI listener. The empirical story is strong but still early, with limited evidence yet on very large-scale production heterogeneity, long-context extremes, and operational tradeoffs beyond the reported benchmarks.",
"downgrade_reasons": [
"serving-systems topic is narrower than frontier model capability papers",
"evidence appears centered on single-machine vLLM evaluation",
"compression/fusion methods in KV cache are an active crowded area"
],
"what_would_raise_priority": "Independent replication across more serving stacks, larger models, multi-node deployments, and long-context workloads would make this an even stronger must-cover paper.",
"one_sentence_episode_hook": "What if LLM servers could stop treating every request's KV cache as isolated baggage and instead fuse similar cache blocks for a 4.38x memory win and roughly 40% more throughput?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2601.03043",
"title": "Lil: Less is Less When Applying Post-Training Sparse-Attention Algorithms in Long-Decode Stage",
"abstract": "Large language models (LLMs) demonstrate strong capabilities across a wide range of complex tasks and are increasingly deployed at scale, placing significant demands on inference efficiency. Prior work typically decomposes inference into prefill and decode stages, with the decode stage dominating total latency. To reduce time and memory complexity in the decode stage, a line of work introduces sparse-attention algorithms. In this paper, we show, both empirically and theoretically, that sparse attention can paradoxically increase end-to-end complexity: information loss often induces significantly longer sequences, a phenomenon we term ``Less is Less'' (Lil). To mitigate the Lil problem, we propose an early-stopping algorithm that detects the threshold where information loss exceeds information gain during sparse decoding. Our early-stopping algorithm reduces token consumption by up to 90% with a marginal accuracy degradation of less than 2% across reasoning-intensive benchmarks.",
"authors": [
"Junhao Hu",
"Fangze Li",
"Mingtao Xu",
"Feifan Meng",
"Shiju Zhao",
"Tiancheng Hu",
"Ting Peng",
"Anmin Liu",
"Wenrui Huang",
"Chenxu Liu"
],
"published_at": "2026-01-06T14:23:58",
"categories": [
"cs.CL",
"cs.AI",
"cs.LG"
],
"url": "http://arxiv.org/abs/2601.03043v2",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 2,
"influential_citation_count": 0,
"scope_bucket": "foundation",
"domain_bucket": "llm",
"paper_type": "benchmark",
"narrow_domain_flag": false,
"sim_public": 0.5221384167671204,
"sim_memory": 0.5686707496643066,
"sim_negative": 0.24759384989738464,
"broad_relevance": 0.5221384167671204,
"momentum": 0.031333575283049045,
"teachability": 0.85,
"novelty_score": 0.3494328260421753,
"evidence_score": 0.529861228866811,
"direct_memory_relevance": 0.5686707496643066,
"systems_leverage": 0.1,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.25,
"bandwidth_capacity": 0.0,
"transferability_score": 0.7,
"clarity": 0.7999999999999999,
"reproducibility": 0.5,
"public_interest_score": 0.5418023483230938,
"memory_score": 0.4247845723633093,
"quality_score": 0.5889444915467243,
"bridge_score": 0.4247845723633093,
"max_axis_score": 0.5418023483230938,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "180d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Public AI",
"Memory/Storage Adjacent",
"Systems",
"Inference",
"Theory"
],
"status": "Monitor",
"why_now": "Sparse-attention decode tricks are actively being explored to cut long-context serving cost, and this paper argues a counterintuitive failure mode: making attention cheaper can lengthen generations enough to erase the win. That is timely for anyone evaluating post-training inference optimizations on reasoning-heavy LLM workloads.",
"why_not_higher": "The memory/storage link is real but indirect: the core contribution is an evaluation and stopping-policy warning about sparse decode, not a new cache/offload/paging mechanism. It also looks more like a benchmark-plus-mitigation result than a broadly adopted serving recipe at this stage.",
"downgrade_reasons": [
"memory relevance is adjacent rather than core",
"focuses on sparse-attention decode rather than a general serving stack change",
"unclear breadth across production models and hardware settings",
"may be a cautionary benchmark result more than a lasting systems primitive"
],
"what_would_raise_priority": "Show robust gains across major open and closed model families, realistic long-context serving traces, and direct end-to-end latency/memory improvements versus standard KV-cache-efficient baselines.",
"one_sentence_episode_hook": "What if making attention cheaper actually makes your LLM slower by causing it to talk longer?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2601.03229",
"title": "SpANNS: Optimizing Approximate Nearest Neighbor Search for Sparse Vectors Using Near Memory Processing",
"abstract": "Approximate Nearest Neighbor Search (ANNS) is a fundamental operation in vector databases, enabling efficient similarity search in high-dimensional spaces. While dense ANNS has been optimized using specialized hardware accelerators, sparse ANNS remains limited by CPU-based implementations, hindering scalability. This limitation is increasingly critical as hybrid retrieval systems, combining sparse and dense embeddings, become standard in Information Retrieval (IR) pipelines. We propose SpANNS, a near-memory processing architecture for sparse ANNS. SpANNS combines a hybrid inverted index with efficient query management and runtime optimizations. The architecture is built on a CXL Type-2 near-memory platform, where a specialized controller manages query parsing and cluster filtering, while compute-enabled DIMMs perform index traversal and distance computations close to the data. It achieves 15.2x to 21.6x faster execution over the state-of-the-art CPU baselines, offering scalable and efficient solutions for sparse vector search.",
"authors": [
"Tianqi Zhang",
"Flavio Ponzina",
"Tajana Rosing"
],
"published_at": "2026-01-06T18:15:53",
"categories": [
"cs.DB",
"cs.AR"
],
"url": "http://arxiv.org/abs/2601.03229v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 1,
"influential_citation_count": 0,
"scope_bucket": "systems",
"domain_bucket": "other",
"paper_type": "systems",
"narrow_domain_flag": false,
"sim_public": 0.3850385546684265,
"sim_memory": 0.3980174660682678,
"sim_negative": 0.20490583777427673,
"broad_relevance": 0.3850385546684265,
"momentum": 0.017490950964499406,
"teachability": 0.85,
"novelty_score": 0.4463194012641907,
"evidence_score": 0.5493147180559945,
"direct_memory_relevance": 0.3980174660682678,
"systems_leverage": 0.5499999999999999,
"deployment_proximity": 0.7,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.2,
"transferability_score": 0.5599999999999999,
"clarity": 0.7999999999999999,
"reproducibility": 0.5,
"public_interest_score": 0.34635487449145563,
"memory_score": 0.6497230059871997,
"quality_score": 0.5727258872223978,
"bridge_score": 0.34635487449145563,
"max_axis_score": 0.6497230059871997,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "90d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Memory/Storage Core",
"Bridge",
"Systems",
"Hardware",
"Inference"
],
"status": "Monitor",
"why_now": "Hybrid retrieval pipelines are making sparse search relevant again, and this paper tackles it with a concrete near-memory architecture on CXL-style hardware where data movement is the core bottleneck. It fits the current industry interest in memory-centric AI systems beyond just dense model serving.",
"why_not_higher": "The work is more about vector database infrastructure than mainstream model training or LLM serving, so broad AI audience pull is limited. Its impact also depends on specialized near-memory hardware adoption rather than an immediately transferable software technique.",
"downgrade_reasons": [
"narrowly focused on sparse ANNS",
"hardware-specific deployment path",
"indirect relevance to mainstream transformer workflows",
"appears primarily as a systems benchmark paper"
],
"what_would_raise_priority": "Stronger evidence that the ideas transfer to production hybrid retrieval stacks or general-purpose memory-bound AI serving systems would raise it.",
"one_sentence_episode_hook": "If sparse retrieval is back, do we need to move vector search off the CPU and into memory itself?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2601.03324",
"title": "Bare-Metal Tensor Virtualization: Overcoming the Memory Wall in Edge-AI Inference on ARM64",
"abstract": "The deployment of Large Language Models (LLMs) on edge devices is fundamentally constrained by the \"Memory Wall\" the bottleneck where data movement latency outstrips arithmetic throughput. Standard inference runtimes often incur significant overhead through high-level abstractions, dynamic dispatch, and unaligned memory access patterns. In this work, we present a novel \"Virtual Tensor Core\" architecture implemented in software, optimized specifically for ARM64 microarchitectures (Apple Silicon). By bypassing standard library containers in favor of direct memory mapping (mmap) and implementing hand-tuned NEON SIMD kernels, we achieve a form of \"Software-Defined Direct Memory Access (DMA).\" Our proposed Tensor Virtualization Layout (TVL) guarantees 100% cache line utilization for weight matrices, while our zero-copy loader eliminates initialization latency. Experimental results on a 110M parameter model demonstrate a stable throughput of >60 tokens/second on M2 hardware. While proprietary hardware accelerators (e.g., Apple AMX) can achieve higher peak throughput, our architecture provides a fully open, portable, and deterministic reference implementation for studying the memory bottleneck on general-purpose ARM silicon, meeting the 200ms psycholinguistic latency threshold without opaque dependencies.",
"authors": [
"Bugra Kilictas",
"Faruk Alpay"
],
"published_at": "2026-01-06T15:00:40",
"categories": [
"cs.CL",
"cs.AI",
"cs.AR",
"cs.LG"
],
"url": "http://arxiv.org/abs/2601.03324v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "foundation",
"domain_bucket": "llm",
"paper_type": "systems",
"narrow_domain_flag": false,
"sim_public": 0.3978739082813263,
"sim_memory": 0.559910237789154,
"sim_negative": 0.1955457329750061,
"broad_relevance": 0.3978739082813263,
"momentum": 0.0,
"teachability": 0.75,
"novelty_score": 0.48185181617736816,
"evidence_score": 0.26,
"direct_memory_relevance": 0.559910237789154,
"systems_leverage": 0.4,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.4,
"transferability_score": 0.72,
"clarity": 0.7,
"reproducibility": 0.5,
"public_interest_score": 0.34163994491100313,
"memory_score": 0.5979730713367462,
"quality_score": 0.575,
"bridge_score": 0.34163994491100313,
"max_axis_score": 0.5979730713367462,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "180d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Memory/Storage Core",
"Systems",
"Hardware",
"Inference"
],
"status": "Monitor",
"why_now": "Edge LLM inference on commodity ARM64 hardware is timely, and the paper squarely targets the memory-wall problem with concrete claims about cache-line utilization, zero-copy loading, and bandwidth-aware tensor layout. It could make a useful episode if the implementation is real and reproducible because it offers an open alternative to opaque vendor accelerators.",
"why_not_higher": "The evidence appears narrow: a single 110M-parameter model on Apple M2 with ambitious systems claims but limited validation against broader workloads, baselines, and hardware. The framing is highly Apple/ARM-specific and some terminology ('software-defined DMA', 'virtual tensor core') sounds more rhetorical than established.",
"downgrade_reasons": [
"single-model evaluation",
"limited hardware diversity",
"unclear baseline rigor",
"Apple/ARM64 specificity",
"strong marketing-style terminology",
"uncertain novelty versus existing mmap/zero-copy/SIMD runtime work"
],
"what_would_raise_priority": "A stronger cross-model, cross-hardware evaluation with clear ablations against standard runtimes and quantified memory-bandwidth/cache effects would raise this substantially.",
"one_sentence_episode_hook": "Can careful tensor layout, mmap, and hand-tuned NEON kernels let a plain ARM CPU punch above its weight for LLM inference by attacking the memory wall directly?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2601.03199",
"title": "DIP: Dynamic In-Context Planner For Diffusion Language Models",
"abstract": "Diffusion language models (DLMs) have shown strong potential for general natural language tasks with in-context examples. However, due to the bidirectional attention mechanism, DLMs incur substantial computational cost as context length increases. This work addresses this issue with a key discovery: unlike the sequential generation in autoregressive language models (ARLMs), the diffusion generation paradigm in DLMs allows \\textit{efficient dynamic adjustment of the context} during generation. Building on this insight, we propose \\textbf{D}ynamic \\textbf{I}n-Context \\textbf{P}lanner (DIP), a context-optimization method that dynamically selects and inserts in-context examples during generation, rather than providing all examples in the prompt upfront. Results show DIP maintains generation quality while achieving up to 12.9$\\times$ inference speedup over standard inference and 1.17$\\times$ over KV cache-enhanced inference.",
"authors": [
"Yang Li",
"Han Meng",
"Chenan Wang",
"Haipeng Chen"
],
"published_at": "2026-01-06T17:24:16",
"categories": [
"cs.CL",
"cs.AI"
],
"url": "http://arxiv.org/abs/2601.03199v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "foundation",
"domain_bucket": "llm",
"paper_type": "empirical",
"narrow_domain_flag": false,
"sim_public": 0.5605237483978271,
"sim_memory": 0.5769320726394653,
"sim_negative": 0.3419671654701233,
"broad_relevance": 0.5605237483978271,
"momentum": 0.0,
"teachability": 0.85,
"novelty_score": 0.4599868059158325,
"evidence_score": 0.34,
"direct_memory_relevance": 0.5769320726394653,
"systems_leverage": 0.1,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.0,
"transferability_score": 0.82,
"clarity": 0.7999999999999999,
"reproducibility": 0.5,
"public_interest_score": 0.547155145406723,
"memory_score": 0.30907962179183956,
"quality_score": 0.595,
"bridge_score": 0.30907962179183956,
"max_axis_score": 0.547155145406723,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "180d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Public AI",
"Memory/Storage Adjacent",
"Systems",
"Inference"
],
"status": "Monitor",
"why_now": "Diffusion language models are still emerging, and this paper offers a practical serving-side idea: dynamically changing which in-context examples are present during generation to cut inference cost. That makes it timely as people explore non-autoregressive alternatives and prompt-time efficiency tricks.",
"why_not_higher": "The memory/storage angle is indirect: this is mainly context management for DLM inference, not a concrete KV-cache, paging, offload, or bandwidth systems paper. Its impact also depends on diffusion language models becoming more mainstream, and the abstract alone does not establish broad realistic deployment evidence.",
"downgrade_reasons": [
"memory connection is adjacent rather than core",
"depends on adoption of diffusion language models",
"unclear realism of workloads and hardware evaluation from abstract",
"speedup claim may be architecture-specific"
],
"what_would_raise_priority": "Show strong results on widely used DLMs and realistic serving setups, with clear comparisons to standard prompt compression, retrieval, and cache-aware inference baselines.",
"one_sentence_episode_hook": "What if diffusion language models could rewrite their own prompt on the fly, keeping quality while making long-context inference dramatically cheaper?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2601.03204",
"title": "InfiAgent: An Infinite-Horizon Framework for General-Purpose Autonomous Agents",
"abstract": "LLM agents can reason and use tools, but they often break down on long-horizon tasks due to unbounded context growth and accumulated errors. Common remedies such as context compression or retrieval-augmented prompting introduce trade-offs between information fidelity and reasoning stability. We present InfiAgent, a general-purpose framework that keeps the agent's reasoning context strictly bounded regardless of task duration by externalizing persistent state into a file-centric state abstraction. At each step, the agent reconstructs context from a workspace state snapshot plus a fixed window of recent actions. Experiments on DeepResearch and an 80-paper literature review task show that, without task-specific fine-tuning, InfiAgent with a 20B open-source model is competitive with larger proprietary systems and maintains substantially higher long-horizon coverage than context-centric baselines. These results support explicit state externalization as a practical foundation for stable long-horizon agents. Github Repo:https://github.com/ChenglinPoly/infiAgent",
"authors": [
"Chenglin Yu",
"Yuchen Wang",
"Songmiao Wang",
"Hongxia Yang",
"Ming Li"
],
"published_at": "2026-01-06T17:35:57",
"categories": [
"cs.AI",
"cs.MA"
],
"url": "http://arxiv.org/abs/2601.03204v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 1,
"influential_citation_count": 0,
"scope_bucket": "foundation",
"domain_bucket": "llm",
"paper_type": "empirical",
"narrow_domain_flag": false,
"sim_public": 0.5043788552284241,
"sim_memory": 0.3724733591079712,
"sim_negative": 0.22019587457180023,
"broad_relevance": 0.5043788552284241,
"momentum": 0.017490950964499406,
"teachability": 0.75,
"novelty_score": 0.5200152099132538,
"evidence_score": 0.4893147180559946,
"direct_memory_relevance": 0.3724733591079712,
"systems_leverage": 0.25,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.0,
"transferability_score": 0.9199999999999999,
"clarity": 0.7,
"reproducibility": 0.5,
"public_interest_score": 0.5932113359568144,
"memory_score": 0.2830597738991107,
"quality_score": 0.6027258872223977,
"bridge_score": 0.2830597738991107,
"max_axis_score": 0.5932113359568144,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "180d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Public AI",
"Memory/Storage Adjacent",
"Bridge",
"Systems",
"Inference"
],
"status": "Monitor",
"why_now": "Long-horizon agents are a live topic, and this paper offers a practical alternative to ever-growing context windows by externalizing persistent state into files. That design is timely because many teams are hitting reliability and cost limits in agent workflows built around pure prompt history.",
"why_not_higher": "The memory angle is architectural rather than a direct KV-cache, paging, or bandwidth systems result, so it is adjacent rather than core to the podcast's storage lens. Evidence is promising but still limited to agent benchmarks and literature-review style workloads rather than broad real-world deployments.",
"downgrade_reasons": [
"Memory/storage link is indirect",
"Evaluation breadth is limited",
"Agent framework space is crowded",
"Not a direct serving or hardware result"
],
"what_would_raise_priority": "Stronger evidence across diverse real-world agent tasks plus clear comparisons on cost, latency, failure recovery, and context-length scaling would raise it to cover-now territory.",
"one_sentence_episode_hook": "What if the way to make agents run for days isn't a bigger context window at all, but treating state like a filesystem and rebuilding the mind from snapshots?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2601.03417",
"title": "Implicit Graph, Explicit Retrieval: Towards Efficient and Interpretable Long-horizon Memory for Large Language Models",
"abstract": "Long-horizon applications increasingly require large language models (LLMs) to answer queries when relevant evidence is sparse and dispersed across very long contexts. Existing memory systems largely follow two paradigms: explicit structured memories offer interpretability but often become brittle under long-context overload, while latent memory mechanisms are efficient and stable yet difficult to inspect. We propose LatentGraphMem, a memory framework that combines implicit graph memory with explicit subgraph retrieval. LatentGraphMem stores a graph-structured memory in latent space for stability and efficiency, and exposes a task-specific subgraph retrieval interface that returns a compact symbolic subgraph under a fixed budget for downstream reasoning and human inspection. During training, an explicit graph view is materialized to interface with a frozen reasoner for question-answering supervision. At inference time, retrieval is performed in latent space and only the retrieved subgraph is externalized. Experiments on long-horizon benchmarks across multiple model scales show that LatentGraphMem consistently outperforms representative explicit-graph and latent-memory baselines, while enabling parameter-efficient adaptation and flexible scaling to larger reasoners without introducing large symbolic artifacts.",
"authors": [
"Xin Zhang",
"Kailai Yang",
"Hao Li",
"Chenyue Li",
"Qiyu Wei",
"Sophia Ananiadou"
],
"published_at": "2026-01-06T21:10:10",
"categories": [
"cs.CL"
],
"url": "http://arxiv.org/abs/2601.03417v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "foundation",
"domain_bucket": "llm",
"paper_type": "benchmark",
"narrow_domain_flag": false,
"sim_public": 0.4132891595363617,
"sim_memory": 0.4329555630683899,
"sim_negative": 0.25163236260414124,
"broad_relevance": 0.4132891595363617,
"momentum": 0.0,
"teachability": 0.85,
"novelty_score": 0.5272353291511536,
"evidence_score": 0.36000000000000004,
"direct_memory_relevance": 0.4329555630683899,
"systems_leverage": 0.25,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.0,
"transferability_score": 0.82,
"clarity": 0.7999999999999999,
"reproducibility": 0.5,
"public_interest_score": 0.49307204723358156,
"memory_score": 0.17288666892051696,
"quality_score": 0.595,
"bridge_score": 0.17288666892051696,
"max_axis_score": 0.49307204723358156,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "90d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Public AI",
"Memory/Storage Adjacent",
"Bridge",
"Inference"
],
"status": "Monitor",
"why_now": "Long-context LLMs still struggle when relevant evidence is sparse and scattered, and this paper offers an interpretable retrieval-memory hybrid that speaks to a live design question in agentic and long-horizon systems. The latent-plus-explicit graph split is a potentially useful framing for balancing efficiency, stability, and inspectability.",
"why_not_higher": "The memory angle is mostly architectural and retrieval-centric rather than a concrete systems contribution around KV cache, offload, paging, or bandwidth. Evidence appears benchmark-driven on long-horizon QA rather than showing clear impact on realistic serving stacks or production-scale deployments.",
"downgrade_reasons": [
"memory connection is adjacent, not core systems memory",
"appears centered on benchmarked long-horizon QA",
"limited evidence of hardware/serving implications",
"crowded area of long-context and retrieval-memory hybrids"
],
"what_would_raise_priority": "A stronger case that this method changes real LLM serving or agent memory design at scale, especially with concrete efficiency and latency wins on realistic deployments, would raise priority.",
"one_sentence_episode_hook": "Can LLMs keep long-horizon memories in a latent graph yet reveal only the tiny symbolic subgraph you need\u2014getting both efficiency and interpretability without drowning in context?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2601.03017",
"title": "MMFormalizer: Multimodal Autoformalization in the Wild",
"abstract": "Autoformalization, which translates natural language mathematics into formal statements to enable machine reasoning, faces fundamental challenges in the wild due to the multimodal nature of the physical world, where physics requires inferring hidden constraints (e.g., mass or energy) from visual elements. To address this, we propose MMFormalizer, which extends autoformalization beyond text by integrating adaptive grounding with entities from real-world mathematical and physical domains. MMFormalizer recursively constructs formal propositions from perceptually grounded primitives through recursive grounding and axiom composition, with adaptive recursive termination ensuring that every abstraction is supported by visual evidence and anchored in dimensional or axiomatic grounding. We evaluate MMFormalizer on a new benchmark, PhyX-AF, comprising 115 curated samples from MathVerse, PhyX, Synthetic Geometry, and Analytic Geometry, covering diverse multimodal autoformalization tasks. Results show that frontier models such as GPT-5 and Gemini-3-Pro achieve the highest compile and semantic accuracy, with GPT-5 excelling in physical reasoning, while geometry remains the most challenging domain. Overall, MMFormalizer provides a scalable framework for unified multimodal autoformalization, bridging perception and formal reasoning. To the best of our knowledge, this is the first multimodal autoformalization method capable of handling classical mechanics (derived from the Hamiltonian), as well as relativity, quantum mechanics, and thermodynamics. More details are available on our project page: MMFormalizer.github.io",
"authors": [
"Jing Xiong",
"Qi Han",
"Yunta Hsieh",
"Hui Shen",
"Huajian Xin",
"Chaofan Tao",
"Chenyang Zhao",
"Hengyuan Zhang",
"Taiqiang Wu",
"Zhen Zhang"
],
"published_at": "2026-01-06T13:42:51",
"categories": [
"cs.CL"
],
"url": "http://arxiv.org/abs/2601.03017v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "foundation",
"domain_bucket": "llm",
"paper_type": "benchmark",
"narrow_domain_flag": false,
"sim_public": 0.44381266832351685,
"sim_memory": 0.29899269342422485,
"sim_negative": 0.3236953020095825,
"broad_relevance": 0.44381266832351685,
"momentum": 0.0,
"teachability": 0.85,
"novelty_score": 0.6132620275020599,
"evidence_score": 0.34,
"direct_memory_relevance": 0.29899269342422485,
"systems_leverage": 0.1,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.0,
"transferability_score": 0.72,
"clarity": 0.7999999999999999,
"reproducibility": 0.5,
"public_interest_score": 0.515133104622364,
"memory_score": 0.005697808027267437,
"quality_score": 0.595,
"bridge_score": 0.005697808027267437,
"max_axis_score": 0.515133104622364,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "180d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Application",
"Bridge"
],
"status": "Monitor",
"why_now": "Multimodal reasoning is a live topic, and this paper pushes autoformalization beyond text into diagrams and physics scenes, which is a genuinely interesting direction for AI audiences. It also introduces a benchmark that could help expose where frontier multimodal models still fail at grounding formal reasoning.",
"why_not_higher": "The contribution is mostly a task framework and benchmark for a specialized reasoning setting, not something that obviously changes how mainstream AI systems are built or served. Evidence is promising but small-scale, with 115 curated samples and evaluation centered on frontier model performance rather than broad, reproducible systems validation.",
"downgrade_reasons": [
"No direct memory/storage mechanism",
"Primarily a narrow multimodal formalization task",
"Benchmark size is limited",
"Unclear downstream impact on mainstream model building or serving"
],
"what_would_raise_priority": "A larger benchmark plus evidence that the formalization pipeline materially improves real theorem proving, scientific assistants, or multimodal agent reliability would raise its priority.",
"one_sentence_episode_hook": "Can an AI look at a physics or geometry diagram and turn it into machine-checkable mathematics, not just plausible text?",
"source": "digest",
"added": "",
"issue_number": null
}
],
"public": [
{
"arxiv_id": "2601.03027",
"title": "Reducing Hallucinations in LLMs via Factuality-Aware Preference Learning",
"abstract": "Preference alignment methods such as RLHF and Direct Preference Optimization (DPO) improve instruction following, but they can also reinforce hallucinations when preference judgments reward fluency and confidence over factual correctness. We introduce F-DPO (Factuality-aware Direct Preference Optimization), a simple extension of DPO that uses only binary factuality labels. F-DPO (i) applies a label-flipping transformation that corrects misordered preference pairs so the chosen response is never less factual than the rejected one, and (ii) adds a factuality-aware margin that emphasizes pairs with clear correctness differences, while reducing to standard DPO when both responses share the same factuality. We construct factuality-aware preference data by augmenting DPO pairs with binary factuality indicators and synthetic hallucinated variants. Across seven open-weight LLMs (1B-14B), F-DPO consistently improves factuality and reduces hallucination rates relative to both base models and standard DPO. On Qwen3-8B, F-DPO reduces hallucination rates by five times (from 0.424 to 0.084) while improving factuality scores by 50 percent (from 5.26 to 7.90). F-DPO also generalizes to out-of-distribution benchmarks: on TruthfulQA, Qwen2.5-14B achieves plus 17 percent MC1 accuracy (0.500 to 0.585) and plus 49 percent MC2 accuracy (0.357 to 0.531). F-DPO requires no auxiliary reward model, token-level annotations, or multi-stage training.",
"authors": [
"Sindhuja Chaduvula",
"Ahmed Y. Radwan",
"Azib Farooq",
"Yani Ioannou",
"Shaina Raza"
],
"published_at": "2026-01-06T14:01:34",
"categories": [
"cs.CL"
],
"url": "http://arxiv.org/abs/2601.03027v2",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 2,
"influential_citation_count": 0,
"scope_bucket": "foundation",
"domain_bucket": "llm",
"paper_type": "benchmark",
"narrow_domain_flag": false,
"sim_public": 0.3803938627243042,
"sim_memory": 0.28131216764450073,
"sim_negative": 0.21838413178920746,
"broad_relevance": 0.3803938627243042,
"momentum": 0.031333575283049045,
"teachability": 0.75,
"novelty_score": 0.5752854645252228,
"evidence_score": 0.6198612288668109,
"direct_memory_relevance": 0.28131216764450073,
"systems_leverage": 0.1,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.0,
"transferability_score": 0.94,
"clarity": 0.7,
"reproducibility": 0.5,
"public_interest_score": 0.6231568778827061,
"memory_score": 0.013576997757367537,
"quality_score": 0.6189444915467244,
"bridge_score": 0.013576997757367537,
"max_axis_score": 0.6231568778827061,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "180d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Public AI",
"Training"
],
"status": "Cover now",
"why_now": "Hallucination reduction is a live concern for anyone shipping LLMs, and this proposes a simple drop-in modification to DPO rather than a heavyweight new alignment stack. The reported gains across multiple open-weight models and on out-of-distribution truthfulness benchmarks make it timely for practitioners evaluating post-training recipes.",
"why_not_higher": "This is not a memory/storage paper, and much of the contribution is an incremental alignment objective plus dataset relabeling rather than a broader systems shift. The evidence is strong for factuality benchmarks, but the paper does not yet show downstream product impact or tradeoffs on helpfulness, style, and preference quality in real deployments.",
"downgrade_reasons": [
"No direct memory or storage mechanism",
"Primarily an alignment/post-training method",
"Benchmark-heavy with limited deployment evidence",
"Unclear tradeoffs versus general preference optimization objectives"
],
"what_would_raise_priority": "Independent replication on widely used post-training stacks plus clearer results on real assistant workloads and robustness-helpfulness tradeoffs would raise it further.",
"one_sentence_episode_hook": "What if standard preference tuning is teaching LLMs to sound right instead of be right\u2014and a tiny DPO fix can cut hallucinations 5x?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2601.03190",
"title": "Maximizing Local Entropy Where It Matters: Prefix-Aware Localized LLM Unlearning",
"abstract": "Machine unlearning aims to forget sensitive knowledge from Large Language Models (LLMs) while maintaining general utility. However, existing approaches typically treat all tokens in a response indiscriminately and enforce uncertainty over the entire vocabulary. This global treatment results in unnecessary utility degradation and extends optimization to content-agnostic regions. To address these limitations, we propose PALU (Prefix-Aware Localized Unlearning), a framework driven by a local entropy maximization objective across both temporal and vocabulary dimensions. PALU reveals that (i) suppressing the sensitive prefix alone is sufficient to sever the causal generation link, and (ii) flattening only the top-$k$ logits is adequate to maximize uncertainty in the critical subspace. These findings allow PALU to avoid redundant optimization across the full vocabulary and parameter space while minimizing collateral damage to general model performance. Extensive experiments validate that PALU achieves superior forgetting efficacy and utility preservation compared to state-of-the-art baselines.",
"authors": [
"Naixin Zhai",
"Pengyang Shao",
"Binbin Zheng",
"Yonghui Yang",
"Fei Shen",
"Long Bai",
"Xun Yang"
],
"published_at": "2026-01-06T17:10:48",
"categories": [
"cs.CL"
],
"url": "http://arxiv.org/abs/2601.03190v2",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 5,
"influential_citation_count": 0,
"scope_bucket": "foundation",
"domain_bucket": "llm",
"paper_type": "empirical",
"narrow_domain_flag": false,
"sim_public": 0.4937289357185364,
"sim_memory": 0.5283973217010498,
"sim_negative": 0.2787211835384369,
"broad_relevance": 0.4937289357185364,
"momentum": 0.06108604879161034,
"teachability": 0.85,
"novelty_score": 0.5714424252510071,
"evidence_score": 0.6091759469228055,
"direct_memory_relevance": 0.5283973217010498,
"systems_leverage": 0.1,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.0,
"transferability_score": 0.8599999999999999,
"clarity": 0.7999999999999999,
"reproducibility": 0.5,
"public_interest_score": 0.5929286462999549,
"memory_score": 0.05602031014105155,
"quality_score": 0.6666703787691222,
"bridge_score": 0.05602031014105155,
"max_axis_score": 0.5929286462999549,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "180d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Public AI",
"Training"
],
"status": "Monitor",
"why_now": "LLM unlearning is an active topic because labs increasingly need practical ways to remove sensitive knowledge without wrecking model utility. This paper offers a sharper recipe: only disrupt the sensitive prefix and only flatten the top-k logits, which is an intuitively teachable refinement over blunt global unlearning.",
"why_not_higher": "It is not a memory/storage paper in the podcast's first-class sense, and its main contribution is a more targeted unlearning objective rather than a broadly system-changing training or serving method. The abstract claims strong results, but does not clearly establish realism across frontier-scale models or deployment settings.",
"downgrade_reasons": [
"memory/storage link absent",
"primarily an unlearning-method paper",
"unclear validation on frontier-scale models",
"may be incremental relative to recent selective-editing/unlearning work"
],
"what_would_raise_priority": "Evidence on larger production-relevant models plus clear comparisons on compute cost, stability, and transfer to real deletion workflows would raise priority.",
"one_sentence_episode_hook": "What if effective LLM unlearning doesn't require scrambling everything\u2014just the sensitive prefix and a tiny top-k slice of the output distribution?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2601.03233",
"title": "LTX-2: Efficient Joint Audio-Visual Foundation Model",
"abstract": "Recent text-to-video diffusion models can generate compelling video sequences, yet they remain silent -- missing the semantic, emotional, and atmospheric cues that audio provides. We introduce LTX-2, an open-source foundational model capable of generating high-quality, temporally synchronized audiovisual content in a unified manner. LTX-2 consists of an asymmetric dual-stream transformer with a 14B-parameter video stream and a 5B-parameter audio stream, coupled through bidirectional audio-video cross-attention layers with temporal positional embeddings and cross-modality AdaLN for shared timestep conditioning. This architecture enables efficient training and inference of a unified audiovisual model while allocating more capacity for video generation than audio generation. We employ a multilingual text encoder for broader prompt understanding and introduce a modality-aware classifier-free guidance (modality-CFG) mechanism for improved audiovisual alignment and controllability. Beyond generating speech, LTX-2 produces rich, coherent audio tracks that follow the characters, environment, style, and emotion of each scene -- complete with natural background and foley elements. In our evaluations, the model achieves state-of-the-art audiovisual quality and prompt adherence among open-source systems, while delivering results comparable to proprietary models at a fraction of their computational cost and inference time. All model weights and code are publicly released.",
"authors": [
"Yoav HaCohen",
"Benny Brazowski",
"Nisan Chiprut",
"Yaki Bitterman",
"Andrew Kvochko",
"Avishai Berkowitz",
"Daniel Shalem",
"Daphna Lifschitz",
"Dudu Moshe",
"Eitan Porat"
],
"published_at": "2026-01-06T18:24:41",
"categories": [
"cs.CV"
],
"url": "http://arxiv.org/abs/2601.03233v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 31,
"influential_citation_count": 0,
"scope_bucket": "application",
"domain_bucket": "vision",
"paper_type": "empirical",
"narrow_domain_flag": false,
"sim_public": 0.39005953073501587,
"sim_memory": 0.3181798458099365,
"sim_negative": 0.3072802722454071,
"broad_relevance": 0.39005953073501587,
"momentum": 0.2604762596823727,
"teachability": 0.6,
"novelty_score": 0.5765587687492371,
"evidence_score": 0.6799999999999999,
"direct_memory_relevance": 0.3181798458099365,
"systems_leverage": 0.25,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.0,
"transferability_score": 0.43000000000000005,
"clarity": 0.7,
"reproducibility": 0.5,
"public_interest_score": 0.6005969264693648,
"memory_score": 0.15445395374298096,
"quality_score": 0.595,
"bridge_score": 0.15445395374298096,
"max_axis_score": 0.6005969264693648,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [
"compound_window:90d"
],
"time_window": "90d",
"compound_window_boost": 0.0315,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Public AI",
"Training",
"Inference",
"Application"
],
"status": "Monitor",
"why_now": "Open-source, jointly generated audio-video foundation models are timely because the field is moving from silent video generation to full multimodal media generation. A broad AI audience will care about whether unified audiovisual generation can approach proprietary systems while being practical to run and inspect.",
"why_not_higher": "The main contribution is a strong multimodal generative model rather than a broadly reusable systems idea, and the memory/storage angle is weak. It also risks being covered as a product-capability milestone more than a paper that changes how most AI teams build systems.",
"downgrade_reasons": [
"memory/storage connection is indirect",
"primarily a model-and-benchmark advance",
"unclear breadth of transfer beyond audiovisual generation",
"evidence may be strong within domain but not clearly transformative for general AI systems"
],
"what_would_raise_priority": "Clear ablations and deployment evidence showing that the asymmetric dual-stream design materially changes training or serving efficiency for large multimodal models beyond this specific audiovisual task would raise priority.",
"one_sentence_episode_hook": "What does it take to make open-source video models stop being silent\u2014and can a single asymmetric transformer generate both the scene and its soundtrack without blowing up cost?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2601.03164",
"title": "WebAnchor: Anchoring Agent Planning to Stabilize Long-Horizon Web Reasoning",
"abstract": "Large Language Model(LLM)-based agents have shown strong capabilities in web information seeking, with reinforcement learning (RL) becoming a key optimization paradigm. However, planning remains a bottleneck, as existing methods struggle with long-horizon strategies. Our analysis reveals a critical phenomenon, plan anchor, where the first reasoning step disproportionately impacts downstream behavior in long-horizon web reasoning tasks. Current RL algorithms, fail to account for this by uniformly distributing rewards across the trajectory. To address this, we propose Anchor-GRPO, a two-stage RL framework that decouples planning and execution. In Stage 1, the agent optimizes its first-step planning using fine-grained rubrics derived from self-play experiences and human calibration. In Stage 2, execution is aligned with the initial plan through sparse rewards, ensuring stable and efficient tool usage. We evaluate Anchor-GRPO on four benchmarks: BrowseComp, BrowseComp-Zh, GAIA, and XBench-DeepSearch. Across models from 3B to 30B, Anchor-GRPO outperforms baseline GRPO and First-step GRPO, improving task success and tool efficiency. Notably, WebAnchor-30B achieves 46.0% pass@1 on BrowseComp and 76.4% on GAIA. Anchor-GRPO also demonstrates strong scalability, getting higher accuracy as model size and context length increase.",
"authors": [
"Xinmiao Yu",
"Liwen Zhang",
"Xiaocheng Feng",
"Yong Jiang",
"Bing Qin",
"Pengjun Xie",
"Jingren Zhou"
],
"published_at": "2026-01-06T16:36:40",
"categories": [
"cs.CL"
],
"url": "http://arxiv.org/abs/2601.03164v2",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 2,
"influential_citation_count": 0,
"scope_bucket": "foundation",
"domain_bucket": "llm",
"paper_type": "benchmark",
"narrow_domain_flag": false,
"sim_public": 0.5007184147834778,
"sim_memory": 0.32674306631088257,
"sim_negative": 0.26186734437942505,
"broad_relevance": 0.5007184147834778,
"momentum": 0.031333575283049045,
"teachability": 0.85,
"novelty_score": 0.5116108357906342,
"evidence_score": 0.559861228866811,
"direct_memory_relevance": 0.32674306631088257,
"systems_leverage": 0.1,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.0,
"transferability_score": 0.7799999999999999,
"clarity": 0.7999999999999999,
"reproducibility": 0.5,
"public_interest_score": 0.58970304919027,
"memory_score": 0.0272062673572821,
"quality_score": 0.6389444915467244,
"bridge_score": 0.0272062673572821,
"max_axis_score": 0.58970304919027,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "180d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Public AI",
"Training",
"Inference",
"Application"
],
"status": "Monitor",
"why_now": "Web agents and long-horizon reasoning are active topics, and this paper offers a concrete training idea: explicitly optimize the first planning step because it anchors the whole trajectory. That is timely for anyone trying to make browser or tool-using agents more reliable.",
"why_not_higher": "This is not a memory/storage paper in any meaningful systems sense, and its core contribution is still fairly specific to web reasoning agents trained with RL. The claimed insight is interesting, but it does not yet clearly change mainstream LLM building or serving outside this niche.",
"downgrade_reasons": [
"memory/storage link is absent",
"agent-RL niche rather than broad systems impact",
"benchmark-driven with limited proof of generality beyond web tasks",
"does not clearly alter serving stack or model architecture"
],
"what_would_raise_priority": "Show the anchor effect and the two-stage training scheme transferring robustly to broader tool-use and agent settings beyond web benchmarks, ideally with stronger ablations and external replications.",
"one_sentence_episode_hook": "What if the fate of a long-running web agent is mostly decided by its very first thought\u2014and RL has been rewarding the wrong part of the trajectory all along?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2601.02972",
"title": "Correct, Concise and Complete: Multi-stage Training For Adaptive Reasoning",
"abstract": "The reasoning capabilities of large language models (LLMs) have improved substantially through increased test-time computation, typically in the form of intermediate tokens known as chain-of-thought (CoT). However, CoT often becomes unnecessarily long, increasing computation cost without actual accuracy gains or sometimes even degrading performance, a phenomenon known as ``overthinking''. We propose a multi-stage efficient reasoning method that combines supervised fine-tuning -- via rejection sampling or reasoning trace reformatting -- with reinforcement learning using an adaptive length penalty. We introduce a lightweight reward function that penalizes tokens generated after the first correct answer but encouraging self-verification only when beneficial. We conduct a holistic evaluation across seven diverse reasoning tasks, analyzing the accuracy-response length trade-off. Our approach reduces response length by an average of 28\\% for 8B models and 40\\% for 32B models, while incurring only minor performance drops of 1.6 and 2.5 points, respectively. Despite its conceptual simplicity, it achieves a superior trade-off compared to more complex state-of-the-art efficient reasoning methods, scoring 76.6, in terms of the area under the Overthinking-Adjusted Accuracy curve ($\\text{AUC}_{\\text{OAA}}$) -- 5 points above the base model and 2.5 points above the second-best approach.",
"authors": [
"Nathana\u00ebl Carraz Rakotonirina",
"Ren Pang",
"Neha Anna John",
"Michael Bohlke-Schneider",
"Momchil Hardalov"
],
"published_at": "2026-01-06T12:31:51",
"categories": [
"cs.CL",
"cs.AI"
],
"url": "http://arxiv.org/abs/2601.02972v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 1,
"influential_citation_count": 0,
"scope_bucket": "foundation",
"domain_bucket": "llm",
"paper_type": "empirical",
"narrow_domain_flag": false,
"sim_public": 0.4297025799751282,
"sim_memory": 0.41212713718414307,
"sim_negative": 0.19631147384643555,
"broad_relevance": 0.4297025799751282,
"momentum": 0.017490950964499406,
"teachability": 0.85,
"novelty_score": 0.5553911328315735,
"evidence_score": 0.5493147180559945,
"direct_memory_relevance": 0.41212713718414307,
"systems_leverage": 0.1,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.0,
"transferability_score": 0.72,
"clarity": 0.7999999999999999,
"reproducibility": 0.5,
"public_interest_score": 0.5961148418185735,
"memory_score": 0.10795590732196225,
"quality_score": 0.5727258872223978,
"bridge_score": 0.10795590732196225,
"max_axis_score": 0.5961148418185735,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,