@@ -496,46 +496,55 @@ <h2>Functions<a class="headerlink" href="#functions" title="Link to this heading
496
496
< tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.dec_max_nreg " title ="tilelang.language.builtin.dec_max_nreg "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> dec_max_nreg</ span > </ code > </ a > (reg_count)</ p > </ td >
497
497
< td > < p > Decrement the maximum number of registers to use.</ p > </ td >
498
498
</ tr >
499
+ < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.annotate_producer_reg_dealloc " title ="tilelang.language.builtin.annotate_producer_reg_dealloc "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> annotate_producer_reg_dealloc</ span > </ code > </ a > ([reg_count])</ p > </ td >
500
+ < td > < p > Annotate the producer reg dealloc.</ p > </ td >
501
+ </ tr >
502
+ < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.annotate_consumer_reg_alloc " title ="tilelang.language.builtin.annotate_consumer_reg_alloc "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> annotate_consumer_reg_alloc</ span > </ code > </ a > ([reg_count])</ p > </ td >
503
+ < td > < p > Annotate the consumer reg alloc.</ p > </ td >
504
+ </ tr >
499
505
< tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.no_set_max_nreg " title ="tilelang.language.builtin.no_set_max_nreg "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> no_set_max_nreg</ span > </ code > </ a > ()</ p > </ td >
500
506
< td > < p > Disable the maximum register limit setting.</ p > </ td >
501
507
</ tr >
502
- < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_wait_parity " title ="tilelang.language.builtin.mbarrier_wait_parity "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> mbarrier_wait_parity</ span > </ code > </ a > (mbarrier, parity)</ p > </ td >
508
+ < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.disable_warp_group_reg_alloc " title ="tilelang.language.builtin.disable_warp_group_reg_alloc "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> disable_warp_group_reg_alloc</ span > </ code > </ a > ()</ p > </ td >
509
+ < td > < p > Disable the warp group reg alloc.</ p > </ td >
510
+ </ tr >
511
+ < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_wait_parity " title ="tilelang.language.builtin.mbarrier_wait_parity "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> mbarrier_wait_parity</ span > </ code > </ a > (mbarrier, parity)</ p > </ td >
503
512
< td > < p > Wait for memory barrier parity condition.</ p > </ td >
504
513
</ tr >
505
- < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_arrive " title ="tilelang.language.builtin.mbarrier_arrive "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> mbarrier_arrive</ span > </ code > </ a > (mbarrier)</ p > </ td >
514
+ < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_arrive " title ="tilelang.language.builtin.mbarrier_arrive "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> mbarrier_arrive</ span > </ code > </ a > (mbarrier)</ p > </ td >
506
515
< td > < p > Arrive at memory barrier.</ p > </ td >
507
516
</ tr >
508
- < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_expect_tx " title ="tilelang.language.builtin.mbarrier_expect_tx "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> mbarrier_expect_tx</ span > </ code > </ a > (*args)</ p > </ td >
517
+ < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_expect_tx " title ="tilelang.language.builtin.mbarrier_expect_tx "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> mbarrier_expect_tx</ span > </ code > </ a > (*args)</ p > </ td >
509
518
< td > < p > Set expected transaction count for memory barrier.</ p > </ td >
510
519
</ tr >
511
- < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.wait_wgmma " title ="tilelang.language.builtin.wait_wgmma "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> wait_wgmma</ span > </ code > </ a > (id)</ p > </ td >
520
+ < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.wait_wgmma " title ="tilelang.language.builtin.wait_wgmma "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> wait_wgmma</ span > </ code > </ a > (id)</ p > </ td >
512
521
< td > < p > Wait for WGMMA (Warp Group Matrix Multiply-Accumulate) operations to complete.</ p > </ td >
513
522
</ tr >
514
- < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.barrier_wait " title ="tilelang.language.builtin.barrier_wait "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> barrier_wait</ span > </ code > </ a > (barrier_id[, parity])</ p > </ td >
523
+ < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.barrier_wait " title ="tilelang.language.builtin.barrier_wait "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> barrier_wait</ span > </ code > </ a > (barrier_id[, parity])</ p > </ td >
515
524
< td > < p > Wait for a memory barrier to complete.</ p > </ td >
516
525
</ tr >
517
- < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.barrier_arrive " title ="tilelang.language.builtin.barrier_arrive "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> barrier_arrive</ span > </ code > </ a > (barrier_id)</ p > </ td >
526
+ < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.barrier_arrive " title ="tilelang.language.builtin.barrier_arrive "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> barrier_arrive</ span > </ code > </ a > (barrier_id)</ p > </ td >
518
527
< td > < p > Arrive at a memory barrier.</ p > </ td >
519
528
</ tr >
520
- < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.shfl_xor " title ="tilelang.language.builtin.shfl_xor "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> shfl_xor</ span > </ code > </ a > (value, offset)</ p > </ td >
529
+ < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.shfl_xor " title ="tilelang.language.builtin.shfl_xor "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> shfl_xor</ span > </ code > </ a > (value, offset)</ p > </ td >
521
530
< td > < p > Perform a shuffle operation with XOR offset.</ p > </ td >
522
531
</ tr >
523
- < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.shfl_down " title ="tilelang.language.builtin.shfl_down "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> shfl_down</ span > </ code > </ a > (value, offset)</ p > </ td >
532
+ < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.shfl_down " title ="tilelang.language.builtin.shfl_down "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> shfl_down</ span > </ code > </ a > (value, offset)</ p > </ td >
524
533
< td > < p > Perform a shuffle operation with down offset.</ p > </ td >
525
534
</ tr >
526
- < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.shfl_up " title ="tilelang.language.builtin.shfl_up "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> shfl_up</ span > </ code > </ a > (value, offset)</ p > </ td >
535
+ < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.shfl_up " title ="tilelang.language.builtin.shfl_up "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> shfl_up</ span > </ code > </ a > (value, offset)</ p > </ td >
527
536
< td > < p > Perform a shuffle operation with up offset.</ p > </ td >
528
537
</ tr >
529
- < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.sync_threads " title ="tilelang.language.builtin.sync_threads "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> sync_threads</ span > </ code > </ a > ()</ p > </ td >
538
+ < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.sync_threads " title ="tilelang.language.builtin.sync_threads "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> sync_threads</ span > </ code > </ a > ()</ p > </ td >
530
539
< td > < p > Synchronize all threads in a warp.</ p > </ td >
531
540
</ tr >
532
- < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.sync_thread_partial " title ="tilelang.language.builtin.sync_thread_partial "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> sync_thread_partial</ span > </ code > </ a > (barrier_id)</ p > </ td >
541
+ < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.sync_thread_partial " title ="tilelang.language.builtin.sync_thread_partial "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> sync_thread_partial</ span > </ code > </ a > (barrier_id)</ p > </ td >
533
542
< td > < p > Synchronize threads within a warp.</ p > </ td >
534
543
</ tr >
535
- < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.sync_global " title ="tilelang.language.builtin.sync_global "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> sync_global</ span > </ code > </ a > ()</ p > </ td >
544
+ < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.sync_global " title ="tilelang.language.builtin.sync_global "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> sync_global</ span > </ code > </ a > ()</ p > </ td >
536
545
< td > < p > Synchronize all threads in a block.</ p > </ td >
537
546
</ tr >
538
- < tr class ="row-even "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.sync_grid " title ="tilelang.language.builtin.sync_grid "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> sync_grid</ span > </ code > </ a > ()</ p > </ td >
547
+ < tr class ="row-odd "> < td > < p > < a class ="reference internal " href ="#tilelang.language.builtin.sync_grid " title ="tilelang.language.builtin.sync_grid "> < code class ="xref py py-obj docutils literal notranslate "> < span class ="pre "> sync_grid</ span > </ code > </ a > ()</ p > </ td >
539
548
< td > < p > Synchronize all threads in a grid.</ p > </ td >
540
549
</ tr >
541
550
</ tbody >
@@ -718,12 +727,40 @@ <h2>Module Contents<a class="headerlink" href="#module-contents" title="Link to
718
727
</ dl >
719
728
</ dd > </ dl >
720
729
730
+ < dl class ="py function ">
731
+ < dt class ="sig sig-object py " id ="tilelang.language.builtin.annotate_producer_reg_dealloc ">
732
+ < span class ="sig-prename descclassname "> < span class ="pre "> tilelang.language.builtin.</ span > </ span > < span class ="sig-name descname "> < span class ="pre "> annotate_producer_reg_dealloc</ span > </ span > < span class ="sig-paren "> (</ span > < em class ="sig-param "> < span class ="n "> < span class ="pre "> reg_count</ span > </ span > < span class ="o "> < span class ="pre "> =</ span > </ span > < span class ="default_value "> < span class ="pre "> 24</ span > </ span > </ em > < span class ="sig-paren "> )</ span > < a class ="headerlink " href ="#tilelang.language.builtin.annotate_producer_reg_dealloc " title ="Link to this definition "> ¶</ a > </ dt >
733
+ < dd > < p > Annotate the producer reg dealloc.</ p >
734
+ < dl class ="field-list simple ">
735
+ < dt class ="field-odd "> Parameters< span class ="colon "> :</ span > </ dt >
736
+ < dd class ="field-odd "> < p > < strong > reg_count</ strong > (< em > int</ em > )</ p >
737
+ </ dd >
738
+ </ dl >
739
+ </ dd > </ dl >
740
+
741
+ < dl class ="py function ">
742
+ < dt class ="sig sig-object py " id ="tilelang.language.builtin.annotate_consumer_reg_alloc ">
743
+ < span class ="sig-prename descclassname "> < span class ="pre "> tilelang.language.builtin.</ span > </ span > < span class ="sig-name descname "> < span class ="pre "> annotate_consumer_reg_alloc</ span > </ span > < span class ="sig-paren "> (</ span > < em class ="sig-param "> < span class ="n "> < span class ="pre "> reg_count</ span > </ span > < span class ="o "> < span class ="pre "> =</ span > </ span > < span class ="default_value "> < span class ="pre "> 240</ span > </ span > </ em > < span class ="sig-paren "> )</ span > < a class ="headerlink " href ="#tilelang.language.builtin.annotate_consumer_reg_alloc " title ="Link to this definition "> ¶</ a > </ dt >
744
+ < dd > < p > Annotate the consumer reg alloc.</ p >
745
+ < dl class ="field-list simple ">
746
+ < dt class ="field-odd "> Parameters< span class ="colon "> :</ span > </ dt >
747
+ < dd class ="field-odd "> < p > < strong > reg_count</ strong > (< em > int</ em > )</ p >
748
+ </ dd >
749
+ </ dl >
750
+ </ dd > </ dl >
751
+
721
752
< dl class ="py function ">
722
753
< dt class ="sig sig-object py " id ="tilelang.language.builtin.no_set_max_nreg ">
723
754
< span class ="sig-prename descclassname "> < span class ="pre "> tilelang.language.builtin.</ span > </ span > < span class ="sig-name descname "> < span class ="pre "> no_set_max_nreg</ span > </ span > < span class ="sig-paren "> (</ span > < span class ="sig-paren "> )</ span > < a class ="headerlink " href ="#tilelang.language.builtin.no_set_max_nreg " title ="Link to this definition "> ¶</ a > </ dt >
724
755
< dd > < p > Disable the maximum register limit setting.</ p >
725
756
</ dd > </ dl >
726
757
758
+ < dl class ="py function ">
759
+ < dt class ="sig sig-object py " id ="tilelang.language.builtin.disable_warp_group_reg_alloc ">
760
+ < span class ="sig-prename descclassname "> < span class ="pre "> tilelang.language.builtin.</ span > </ span > < span class ="sig-name descname "> < span class ="pre "> disable_warp_group_reg_alloc</ span > </ span > < span class ="sig-paren "> (</ span > < span class ="sig-paren "> )</ span > < a class ="headerlink " href ="#tilelang.language.builtin.disable_warp_group_reg_alloc " title ="Link to this definition "> ¶</ a > </ dt >
761
+ < dd > < p > Disable the warp group reg alloc.</ p >
762
+ </ dd > </ dl >
763
+
727
764
< dl class ="py function ">
728
765
< dt class ="sig sig-object py " id ="tilelang.language.builtin.mbarrier_wait_parity ">
729
766
< span class ="sig-prename descclassname "> < span class ="pre "> tilelang.language.builtin.</ span > </ span > < span class ="sig-name descname "> < span class ="pre "> mbarrier_wait_parity</ span > </ span > < span class ="sig-paren "> (</ span > < em class ="sig-param "> < span class ="n "> < span class ="pre "> mbarrier</ span > </ span > </ em > , < em class ="sig-param "> < span class ="n "> < span class ="pre "> parity</ span > </ span > </ em > < span class ="sig-paren "> )</ span > < a class ="headerlink " href ="#tilelang.language.builtin.mbarrier_wait_parity " title ="Link to this definition "> ¶</ a > </ dt >
@@ -1023,7 +1060,10 @@ <h2>Module Contents<a class="headerlink" href="#module-contents" title="Link to
1023
1060
< li > < a class ="reference internal " href ="#tilelang.language.builtin.set_max_nreg "> < code class ="docutils literal notranslate "> < span class ="pre "> set_max_nreg()</ span > </ code > </ a > </ li >
1024
1061
< li > < a class ="reference internal " href ="#tilelang.language.builtin.inc_max_nreg "> < code class ="docutils literal notranslate "> < span class ="pre "> inc_max_nreg()</ span > </ code > </ a > </ li >
1025
1062
< li > < a class ="reference internal " href ="#tilelang.language.builtin.dec_max_nreg "> < code class ="docutils literal notranslate "> < span class ="pre "> dec_max_nreg()</ span > </ code > </ a > </ li >
1063
+ < li > < a class ="reference internal " href ="#tilelang.language.builtin.annotate_producer_reg_dealloc "> < code class ="docutils literal notranslate "> < span class ="pre "> annotate_producer_reg_dealloc()</ span > </ code > </ a > </ li >
1064
+ < li > < a class ="reference internal " href ="#tilelang.language.builtin.annotate_consumer_reg_alloc "> < code class ="docutils literal notranslate "> < span class ="pre "> annotate_consumer_reg_alloc()</ span > </ code > </ a > </ li >
1026
1065
< li > < a class ="reference internal " href ="#tilelang.language.builtin.no_set_max_nreg "> < code class ="docutils literal notranslate "> < span class ="pre "> no_set_max_nreg()</ span > </ code > </ a > </ li >
1066
+ < li > < a class ="reference internal " href ="#tilelang.language.builtin.disable_warp_group_reg_alloc "> < code class ="docutils literal notranslate "> < span class ="pre "> disable_warp_group_reg_alloc()</ span > </ code > </ a > </ li >
1027
1067
< li > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_wait_parity "> < code class ="docutils literal notranslate "> < span class ="pre "> mbarrier_wait_parity()</ span > </ code > </ a > </ li >
1028
1068
< li > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_arrive "> < code class ="docutils literal notranslate "> < span class ="pre "> mbarrier_arrive()</ span > </ code > </ a > </ li >
1029
1069
< li > < a class ="reference internal " href ="#tilelang.language.builtin.mbarrier_expect_tx "> < code class ="docutils literal notranslate "> < span class ="pre "> mbarrier_expect_tx()</ span > </ code > </ a > </ li >
0 commit comments