.gitignore README.md requirements.txt setup.py tb_eval/__init__.py tb_eval/constants.py tb_eval/initializations.py tb_eval/run.py tb_eval.egg-info/PKG-INFO tb_eval.egg-info/SOURCES.txt tb_eval.egg-info/dependency_links.txt tb_eval.egg-info/entry_points.txt tb_eval.egg-info/top_level.txt tb_eval/data/__init__.py tb_eval/data/ROCm/data/ROCm_eval_complex_instruct_v1.json tb_eval/data/ROCm/data/ROCm_v1/gemm.py tb_eval/data/ROCm/data/ROCm_v1/layernorm.py tb_eval/data/ROCm/data/ROCm_v1/moe_gemm.py tb_eval/data/ROCm/data/ROCm_v1/multreduce_matmul_dot_kernel.py tb_eval/data/ROCm/data/ROCm_v1/naive_softmax.py tb_eval/data/ROCm/data/ROCm_v1/rmsnorm_bwd.py tb_eval/data/ROCm/data/ROCm_v1/rmsnorm_fwd.py tb_eval/data/ROCm/data/ROCm_v1/softmax.py tb_eval/data/ROCm/data/ROCm_v1/test_add_kernel.py tb_eval/data/ROCm/data/ROCm_v1/test_batched_vecmat.py tb_eval/data/ROCm/data/ROCm_v1/test_block_copy.py tb_eval/data/ROCm/data/ROCm_v1/test_block_pointer_matmul.py tb_eval/data/ROCm/data/ROCm_v1/test_cast_matmul.py tb_eval/data/ROCm/data/ROCm_v1/test_chained_dot_fp8.py tb_eval/data/ROCm/data/ROCm_v1/test_chained_matmul.py tb_eval/data/ROCm/data/ROCm_v1/test_flashattention_fwd.py tb_eval/data/ROCm/data/ROCm_v1/test_gemm_fusion.py tb_eval/data/ROCm/data/ROCm_v1/test_gemm_no_scf.py tb_eval/data/ROCm/data/ROCm_v1/test_iv_dependent_matmul.py tb_eval/data/ROCm/data/ROCm_v1/test_kernel_dot.py tb_eval/data/ROCm/data/ROCm_v1/test_kernel_sub.py tb_eval/data/ROCm/data/ROCm_v1/test_load_reduce.py tb_eval/data/ROCm/data/ROCm_v1/test_matmul_MXFP.py tb_eval/data/ROCm/data/ROCm_v1/test_randn.py tb_eval/data/ROCm/data/ROCm_v1/test_random_int.py tb_eval/data/ROCm/data/ROCm_v1/test_reverse_range.py tb_eval/data/ROCm/data/ROCm_v1/test_tma_store_gemm.py tb_eval/data/ROCm/data/ROCm_v1/test_triton_flip.py tb_eval/data/ROCm/data/ROCm_v1/test_triton_sort.py tb_eval/data/ROCm/data/ROCm_v1/test_triton_swizzle2d.py tb_eval/data/ROCm/data/ROCm_v1/triton_multreduce_matmul_kernel.py tb_eval/data/ROCm/data/ROCm_v1_autotune/gemm.py tb_eval/data/ROCm/data/ROCm_v1_autotune/layernorm.py tb_eval/data/ROCm/data/ROCm_v1_autotune/moe_gemm.py tb_eval/data/ROCm/data/ROCm_v1_autotune/multreduce_matmul_dot_kernel.py tb_eval/data/ROCm/data/ROCm_v1_autotune/naive_softmax.py tb_eval/data/ROCm/data/ROCm_v1_autotune/rmsnorm_bwd.py tb_eval/data/ROCm/data/ROCm_v1_autotune/rmsnorm_fwd.py tb_eval/data/ROCm/data/ROCm_v1_autotune/softmax.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_add_kernel.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_batched_vecmat.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_block_copy.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_block_pointer_matmul.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_cast_matmul.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_chained_dot_fp8.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_chained_matmul.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_flashattention_fwd.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_gemm_fusion.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_gemm_no_scf.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_iv_dependent_matmul.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_kernel_dot.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_kernel_sub.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_load_reduce.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_matmul_MXFP.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_randn.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_random_int.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_reverse_range.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_tma_store_gemm.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_triton_flip.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_triton_sort.py tb_eval/data/ROCm/data/ROCm_v1_autotune/test_triton_swizzle2d.py tb_eval/data/ROCm/data/ROCm_v1_autotune/triton_multreduce_matmul_kernel.py tb_eval/data/ROCm/data/performance/golden_results/add_kernel_perf.json tb_eval/data/ROCm/data/performance/golden_results/batched_vecmat_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/block_copy_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/block_pointer_matmul_perf.json tb_eval/data/ROCm/data/performance/golden_results/cast_matmul_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/chained_dot_fp8_perf.json tb_eval/data/ROCm/data/performance/golden_results/chained_matmul_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/flash_attention_fwd_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/gemm_fusion_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/gemm_no_scf_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/gemm_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/iv_dependent_matmul_perf.json tb_eval/data/ROCm/data/performance/golden_results/kernel_dot_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/kernel_sub_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/layernorm_triton_fwd_perf.json tb_eval/data/ROCm/data/performance/golden_results/load_reduce_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/matmul_mxfp_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/moe_gemm_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/multreduce_matmul_dot_perf.json tb_eval/data/ROCm/data/performance/golden_results/reverse_range_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/rmsnorm_fwd_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/rmsnorm_triton_bwd_perf.json tb_eval/data/ROCm/data/performance/golden_results/softmax_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/tma_store_gemm_triton_perf.json tb_eval/data/ROCm/data/performance/golden_results/triton_flip_perf.json tb_eval/data/ROCm/data/performance/golden_results/triton_multreduce_matmul_perf.json tb_eval/data/ROCm/data/performance/golden_results/triton_rand_perf.json tb_eval/data/ROCm/data/performance/golden_results/triton_randint_perf.json tb_eval/data/ROCm/data/performance/golden_results/triton_sort_perf.json tb_eval/data/ROCm/data/performance/golden_results/triton_swizzle2d_perf.json tb_eval/data/TritonBench/.gitignore tb_eval/data/TritonBench/LICENSE tb_eval/data/TritonBench/README.md tb_eval/data/TritonBench/__init__.py tb_eval/data/TritonBench/data/TritonBench_G_comp_alpac_v1_fixed_with_difficulty.json tb_eval/data/TritonBench/data/__init__.py tb_eval/data/TritonBench/data/TritonBench_G_v1/__init__.py tb_eval/data/TritonBench/data/TritonBench_G_v1/adam_update_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/add_example.py tb_eval/data/TritonBench/data/TritonBench_G_v1/add_value.py tb_eval/data/TritonBench/data/TritonBench_G_v1/apply_penalty.py tb_eval/data/TritonBench/data/TritonBench_G_v1/attention_forward_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/attention_fwd_triton1.py tb_eval/data/TritonBench/data/TritonBench_G_v1/attention_fwd_triton2.py tb_eval/data/TritonBench/data/TritonBench_G_v1/attention_fwd_triton3.py tb_eval/data/TritonBench/data/TritonBench_G_v1/attention_kernel.py tb_eval/data/TritonBench/data/TritonBench_G_v1/attention_kernel_aligned.py tb_eval/data/TritonBench/data/TritonBench_G_v1/attention_llama.py tb_eval/data/TritonBench/data/TritonBench_G_v1/attention_score.py tb_eval/data/TritonBench/data/TritonBench_G_v1/attn_fwd_causal.py tb_eval/data/TritonBench/data/TritonBench_G_v1/attn_fwd_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/batched_vecmat_mult.py tb_eval/data/TritonBench/data/TritonBench_G_v1/bgmv_expand_slice.py tb_eval/data/TritonBench/data/TritonBench_G_v1/bgmv_shrink_kernel.py tb_eval/data/TritonBench/data/TritonBench_G_v1/block_sparse_attn.py tb_eval/data/TritonBench/data/TritonBench_G_v1/bmm_chunk_bwd.py tb_eval/data/TritonBench/data/TritonBench_G_v1/bmm_chunk_fwd.py tb_eval/data/TritonBench/data/TritonBench_G_v1/bmm_optimized.py tb_eval/data/TritonBench/data/TritonBench_G_v1/cache_transform_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/chunk_bwd_dqkg.py tb_eval/data/TritonBench/data/TritonBench_G_v1/chunk_cumsum_kernel.py tb_eval/data/TritonBench/data/TritonBench_G_v1/chunk_cumsum_vector.py tb_eval/data/TritonBench/data/TritonBench_G_v1/chunk_delta_fwd.py tb_eval/data/TritonBench/data/TritonBench_G_v1/chunk_gate_recurrence.py tb_eval/data/TritonBench/data/TritonBench_G_v1/chunk_gated_attention.py tb_eval/data/TritonBench/data/TritonBench_G_v1/chunk_gla_fwd.py tb_eval/data/TritonBench/data/TritonBench_G_v1/chunk_gla_simple.py tb_eval/data/TritonBench/data/TritonBench_G_v1/chunk_linear_attn.py tb_eval/data/TritonBench/data/TritonBench_G_v1/chunk_retention.py tb_eval/data/TritonBench/data/TritonBench_G_v1/chunk_retention_ops.py tb_eval/data/TritonBench/data/TritonBench_G_v1/chunked_cumsum_fwd.py tb_eval/data/TritonBench/data/TritonBench_G_v1/context_attn_bloom.py tb_eval/data/TritonBench/data/TritonBench_G_v1/context_attn_fwd.py tb_eval/data/TritonBench/data/TritonBench_G_v1/context_attn_llama.py tb_eval/data/TritonBench/data/TritonBench_G_v1/context_attn_mistral.py tb_eval/data/TritonBench/data/TritonBench_G_v1/context_attn_nopad.py tb_eval/data/TritonBench/data/TritonBench_G_v1/cosine_compute.py tb_eval/data/TritonBench/data/TritonBench_G_v1/cross_entropy1.py tb_eval/data/TritonBench/data/TritonBench_G_v1/cross_entropy2.py tb_eval/data/TritonBench/data/TritonBench_G_v1/cross_entropy_ops.py tb_eval/data/TritonBench/data/TritonBench_G_v1/decay_cumsum.py tb_eval/data/TritonBench/data/TritonBench_G_v1/dequantize_matmul.py tb_eval/data/TritonBench/data/TritonBench_G_v1/dequantize_rowwise.py tb_eval/data/TritonBench/data/TritonBench_G_v1/destindex_copy.py tb_eval/data/TritonBench/data/TritonBench_G_v1/destindex_copy_kv1.py tb_eval/data/TritonBench/data/TritonBench_G_v1/destindex_copy_kv2.py tb_eval/data/TritonBench/data/TritonBench_G_v1/diag_ssm_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/dropout_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/embedding_triton_kernel.py tb_eval/data/TritonBench/data/TritonBench_G_v1/f8_conversion_utils.py tb_eval/data/TritonBench/data/TritonBench_G_v1/fast_ce_loss.py tb_eval/data/TritonBench/data/TritonBench_G_v1/fast_layernorm.py tb_eval/data/TritonBench/data/TritonBench_G_v1/fast_rms_layernorm.py tb_eval/data/TritonBench/data/TritonBench_G_v1/fast_rope_embedding.py tb_eval/data/TritonBench/data/TritonBench_G_v1/fifth_order_sph_harmonics.py tb_eval/data/TritonBench/data/TritonBench_G_v1/flash_attn.py tb_eval/data/TritonBench/data/TritonBench_G_v1/flash_decode2_llama.py tb_eval/data/TritonBench/data/TritonBench_G_v1/flash_decode2_phi.py tb_eval/data/TritonBench/data/TritonBench_G_v1/fp4_to_bf16.py tb_eval/data/TritonBench/data/TritonBench_G_v1/fp4_to_bf16_conversion.py tb_eval/data/TritonBench/data/TritonBench_G_v1/fused_activation.py tb_eval/data/TritonBench/data/TritonBench_G_v1/fused_layernorm_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/fused_recurrent_delta.py tb_eval/data/TritonBench/data/TritonBench_G_v1/fused_recurrent_hgrn.py tb_eval/data/TritonBench/data/TritonBench_G_v1/fused_recurrent_retention.py tb_eval/data/TritonBench/data/TritonBench_G_v1/fused_rotary_embedding.py tb_eval/data/TritonBench/data/TritonBench_G_v1/fused_rwkv6_kernel.py tb_eval/data/TritonBench/data/TritonBench_G_v1/geglu_tanh_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/index_select_bwd.py tb_eval/data/TritonBench/data/TritonBench_G_v1/index_select_cat.py tb_eval/data/TritonBench/data/TritonBench_G_v1/int4_matmul.py tb_eval/data/TritonBench/data/TritonBench_G_v1/int8_dequant_matmul.py tb_eval/data/TritonBench/data/TritonBench_G_v1/int8_matmul_kernel.py tb_eval/data/TritonBench/data/TritonBench_G_v1/int8_matmul_quantization.py tb_eval/data/TritonBench/data/TritonBench_G_v1/int8_quantization.py tb_eval/data/TritonBench/data/TritonBench_G_v1/int_scaled_matmul.py tb_eval/data/TritonBench/data/TritonBench_G_v1/isfinite_kernel.py tb_eval/data/TritonBench/data/TritonBench_G_v1/iv_dependent_matmul.py tb_eval/data/TritonBench/data/TritonBench_G_v1/kcache_copy_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/kldiv_compute.py tb_eval/data/TritonBench/data/TritonBench_G_v1/kldiv_ops.py tb_eval/data/TritonBench/data/TritonBench_G_v1/kldiv_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/ksoftmax_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/kv_cache_copy.py tb_eval/data/TritonBench/data/TritonBench_G_v1/kv_cache_filling.py tb_eval/data/TritonBench/data/TritonBench_G_v1/l2_norm_bwd.py tb_eval/data/TritonBench/data/TritonBench_G_v1/l2_norm_triton1.py tb_eval/data/TritonBench/data/TritonBench_G_v1/l2_norm_triton2.py tb_eval/data/TritonBench/data/TritonBench_G_v1/layer_norm_fwd.py tb_eval/data/TritonBench/data/TritonBench_G_v1/layer_norm_liger.py tb_eval/data/TritonBench/data/TritonBench_G_v1/layer_norm_ops.py tb_eval/data/TritonBench/data/TritonBench_G_v1/layer_norm_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/layer_norm_welfold.py tb_eval/data/TritonBench/data/TritonBench_G_v1/layernorm_fwd_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/lightning_attention.py tb_eval/data/TritonBench/data/TritonBench_G_v1/llama_ff_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/log_softmax.py tb_eval/data/TritonBench/data/TritonBench_G_v1/logsumexp_fwd.py tb_eval/data/TritonBench/data/TritonBench_G_v1/lora_expand_gemv.py tb_eval/data/TritonBench/data/TritonBench_G_v1/masked_add_cuda.py tb_eval/data/TritonBench/data/TritonBench_G_v1/masked_select.py tb_eval/data/TritonBench/data/TritonBench_G_v1/matmul_dequant_int4.py tb_eval/data/TritonBench/data/TritonBench_G_v1/matmul_dequantize.py tb_eval/data/TritonBench/data/TritonBench_G_v1/matmul_dequantize_int4.py tb_eval/data/TritonBench/data/TritonBench_G_v1/matmul_kernel.py tb_eval/data/TritonBench/data/TritonBench_G_v1/matmul_leakyrelu.py tb_eval/data/TritonBench/data/TritonBench_G_v1/matmul_leakyrelu_fp8.py tb_eval/data/TritonBench/data/TritonBench_G_v1/matmul_persistent_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/matmul_tma.py tb_eval/data/TritonBench/data/TritonBench_G_v1/matmul_triton1.py tb_eval/data/TritonBench/data/TritonBench_G_v1/matmul_triton2.py tb_eval/data/TritonBench/data/TritonBench_G_v1/matmul_triton_autotune.py tb_eval/data/TritonBench/data/TritonBench_G_v1/matrix_reduction.py tb_eval/data/TritonBench/data/TritonBench_G_v1/matrix_transpose.py tb_eval/data/TritonBench/data/TritonBench_G_v1/matrix_vector_multip.py tb_eval/data/TritonBench/data/TritonBench_G_v1/max_reduction.py tb_eval/data/TritonBench/data/TritonBench_G_v1/mean_reduction.py tb_eval/data/TritonBench/data/TritonBench_G_v1/mixed_sparse_attention.py tb_eval/data/TritonBench/data/TritonBench_G_v1/mul_exponent_compensator.py tb_eval/data/TritonBench/data/TritonBench_G_v1/multinomial_sampling.py tb_eval/data/TritonBench/data/TritonBench_G_v1/nested_loops_processing.py tb_eval/data/TritonBench/data/TritonBench_G_v1/parallel_attention.py tb_eval/data/TritonBench/data/TritonBench_G_v1/parallel_retention_attention.py tb_eval/data/TritonBench/data/TritonBench_G_v1/pow_scalar_tensor.py tb_eval/data/TritonBench/data/TritonBench_G_v1/quant_transpose_kernel.py tb_eval/data/TritonBench/data/TritonBench_G_v1/quantize_copy_kv.py tb_eval/data/TritonBench/data/TritonBench_G_v1/quantize_global.py tb_eval/data/TritonBench/data/TritonBench_G_v1/quantize_kv_copy.py tb_eval/data/TritonBench/data/TritonBench_G_v1/quantize_kv_transform.py tb_eval/data/TritonBench/data/TritonBench_G_v1/rbe_triton_transform.py tb_eval/data/TritonBench/data/TritonBench_G_v1/relu_strided_buffer.py tb_eval/data/TritonBench/data/TritonBench_G_v1/relu_triton_kernel.py tb_eval/data/TritonBench/data/TritonBench_G_v1/reversed_cumsum.py tb_eval/data/TritonBench/data/TritonBench_G_v1/reversed_cumsum_scalar.py tb_eval/data/TritonBench/data/TritonBench_G_v1/rms_matmul_rbe.py tb_eval/data/TritonBench/data/TritonBench_G_v1/rms_norm_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/rms_rbe_matmul.py tb_eval/data/TritonBench/data/TritonBench_G_v1/rmsnorm_fused.py tb_eval/data/TritonBench/data/TritonBench_G_v1/rmsnorm_fused_llama.py tb_eval/data/TritonBench/data/TritonBench_G_v1/rmsnorm_implementation.py tb_eval/data/TritonBench/data/TritonBench_G_v1/rmsnorm_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/rope_backward_transform.py tb_eval/data/TritonBench/data/TritonBench_G_v1/rope_embedding.py tb_eval/data/TritonBench/data/TritonBench_G_v1/rope_transform.py tb_eval/data/TritonBench/data/TritonBench_G_v1/rotary_emb.py tb_eval/data/TritonBench/data/TritonBench_G_v1/rotary_emb_nopad.py tb_eval/data/TritonBench/data/TritonBench_G_v1/rotary_transform.py tb_eval/data/TritonBench/data/TritonBench_G_v1/rotary_transform_ops.py tb_eval/data/TritonBench/data/TritonBench_G_v1/rowwise_quantization_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/seeded_dropout.py tb_eval/data/TritonBench/data/TritonBench_G_v1/sgmv_expand_slice.py tb_eval/data/TritonBench/data/TritonBench_G_v1/sin_computation.py tb_eval/data/TritonBench/data/TritonBench_G_v1/sin_kernel.py tb_eval/data/TritonBench/data/TritonBench_G_v1/softmax_flaggems.py tb_eval/data/TritonBench/data/TritonBench_G_v1/softmax_optimize.py tb_eval/data/TritonBench/data/TritonBench_G_v1/softmax_reducev.py tb_eval/data/TritonBench/data/TritonBench_G_v1/softmax_triton1.py tb_eval/data/TritonBench/data/TritonBench_G_v1/softmax_triton2.py tb_eval/data/TritonBench/data/TritonBench_G_v1/softmax_triton3.py tb_eval/data/TritonBench/data/TritonBench_G_v1/spinning_lock_reduction.py tb_eval/data/TritonBench/data/TritonBench_G_v1/square_matrix.py tb_eval/data/TritonBench/data/TritonBench_G_v1/streamk_matmul.py tb_eval/data/TritonBench/data/TritonBench_G_v1/swiglu_backward.py tb_eval/data/TritonBench/data/TritonBench_G_v1/swiglu_fwd.py tb_eval/data/TritonBench/data/TritonBench_G_v1/swiglu_triton.py tb_eval/data/TritonBench/data/TritonBench_G_v1/token_attn_llama2.py tb_eval/data/TritonBench/data/TritonBench_G_v1/token_attn_mistral.py tb_eval/data/TritonBench/data/TritonBench_G_v1/token_attn_reduceV.py tb_eval/data/TritonBench/data/TritonBench_G_v1/token_softmax_bloom.py tb_eval/data/TritonBench/data/TritonBench_G_v1/token_softmax_llama.py tb_eval/data/TritonBench/data/TritonBench_G_v1/triton_argmax.py tb_eval/data/TritonBench/data/TritonBench_G_v1/triton_attention.py tb_eval/data/TritonBench/data/TritonBench_G_v1/triton_conv2d_fwd.py tb_eval/data/TritonBench/data/TritonBench_G_v1/triton_linear_activation.py tb_eval/data/TritonBench/data/TritonBench_G_v1/triton_matmul.py tb_eval/data/TritonBench/data/TritonBench_G_v1/triton_mul2.py tb_eval/data/TritonBench/data/TritonBench_G_v1/triton_softmax.py tb_eval/data/TritonBench/data/TritonBench_G_v1/uniform_sampling.py tb_eval/data/TritonBench/data/TritonBench_G_v1/var_len_copy.py tb_eval/data/TritonBench/data/TritonBench_G_v1/vector_addition.py tb_eval/data/TritonBench/data/TritonBench_G_v1/vector_addition_custom.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_run.txt.v2 tb_eval/data/TritonBench/performance_metrics/perf_G/golden_run.txt.v3 tb_eval/data/TritonBench/performance_metrics/perf_G/performance_utils.py tb_eval/data/TritonBench/performance_metrics/perf_G/refactor.py tb_eval/data/TritonBench/performance_metrics/perf_G/run_golden_tests.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/adam_update_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/add_example_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/add_value_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/apply_penalty_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/attention_forward_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/attention_fwd_triton1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/attention_fwd_triton2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/attention_fwd_triton3_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/attention_kernel_aligned_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/attention_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/attention_llama_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/attention_score_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/attn_fwd_causal_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/attn_fwd_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/batched_vecmat_mult_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/bgmv_expand_slice_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/bgmv_shrink_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/block_sparse_attn_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/bmm_chunk_bwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/bmm_chunk_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/bmm_optimized_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/cache_transform_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/chunk_bwd_dqkg_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/chunk_cumsum_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/chunk_cumsum_vector_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/chunk_delta_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/chunk_gate_recurrence_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/chunk_gated_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/chunk_gla_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/chunk_gla_simple_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/chunk_linear_attn_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/chunk_retention_ops_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/chunk_retention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/chunked_cumsum_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/context_attn_bloom_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/context_attn_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/context_attn_llama_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/context_attn_mistral_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/context_attn_nopad_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/cosine_compute_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/cross_entropy1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/cross_entropy2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/cross_entropy_ops_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/decay_cumsum_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/dequantize_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/dequantize_rowwise_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/destindex_copy_kv1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/destindex_copy_kv2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/destindex_copy_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/diag_ssm_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/dropout_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/embedding_triton_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/f8_conversion_utils_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/fast_ce_loss_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/fast_layernorm_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/fast_rms_layernorm_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/fast_rope_embedding_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/fifth_order_sph_harmonics_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/flash_attn_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/flash_decode2_llama_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/flash_decode2_phi_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/fp4_to_bf16_conversion_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/fp4_to_bf16_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/fused_activation_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/fused_layernorm_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/fused_recurrent_delta_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/fused_recurrent_hgrn_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/fused_recurrent_retention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/fused_rotary_embedding_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/fused_rwkv6_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/geglu_tanh_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/index_select_bwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/index_select_cat_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/int4_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/int8_dequant_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/int8_matmul_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/int8_matmul_quantization_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/int8_quantization_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/int_scaled_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/isfinite_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/iv_dependent_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/kcache_copy_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/kldiv_compute_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/kldiv_ops_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/kldiv_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/ksoftmax_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/kv_cache_copy_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/kv_cache_filling_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/l2_norm_bwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/l2_norm_triton1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/l2_norm_triton2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/layer_norm_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/layer_norm_liger_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/layer_norm_ops_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/layer_norm_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/layer_norm_welfold_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/layernorm_fwd_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/lightning_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/llama_ff_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/log_softmax_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/logsumexp_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/lora_expand_gemv_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/masked_add_cuda_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/masked_select_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/matmul_dequant_int4_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/matmul_dequantize_int4_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/matmul_dequantize_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/matmul_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/matmul_leakyrelu_fp8_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/matmul_leakyrelu_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/matmul_persistent_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/matmul_tma_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/matmul_triton1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/matmul_triton2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/matmul_triton_autotune_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/matrix_reduction_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/matrix_transpose_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/matrix_vector_multip_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/max_reduction_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/mean_reduction_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/mixed_sparse_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/mul_exponent_compensator_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/multinomial_sampling_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/nested_loops_processing_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/parallel_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/parallel_retention_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/pow_scalar_tensor_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/quant_transpose_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/quantize_copy_kv_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/quantize_global_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/quantize_kv_copy_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/quantize_kv_transform_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/rbe_triton_transform_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/relu_strided_buffer_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/relu_triton_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/reversed_cumsum_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/reversed_cumsum_scalar_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/rms_matmul_rbe_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/rms_norm_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/rms_rbe_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/rmsnorm_fused_llama_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/rmsnorm_fused_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/rmsnorm_implementation_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/rmsnorm_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/rope_backward_transform_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/rope_embedding_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/rope_transform_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/rotary_emb_nopad_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/rotary_emb_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/rotary_transform_ops_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/rotary_transform_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/rowwise_quantization_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/seeded_dropout_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/sgmv_expand_slice_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/sin_computation_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/sin_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/softmax_flaggems_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/softmax_optimize_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/softmax_reducev_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/softmax_triton1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/softmax_triton2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/softmax_triton3_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/spinning_lock_reduction_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/square_matrix_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/streamk_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/swiglu_backward_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/swiglu_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/swiglu_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/token_attn_llama2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/token_attn_mistral_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/token_attn_reduceV_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/token_softmax_bloom_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/token_softmax_llama_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/triton_argmax_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/triton_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/triton_conv2d_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/triton_linear_activation_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/triton_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/triton_mul2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/triton_softmax_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/uniform_sampling_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/var_len_copy_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/vector_addition_custom_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics/vector_addition_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/adam_update_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/add_example_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/add_value_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/apply_penalty_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/attention_forward_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/attention_fwd_triton1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/attention_fwd_triton2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/attention_fwd_triton3_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/attention_kernel_aligned_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/attention_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/attention_llama_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/attention_score_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/attn_fwd_causal_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/attn_fwd_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/batched_vecmat_mult_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/bgmv_expand_slice_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/bgmv_shrink_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/block_sparse_attn_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/bmm_chunk_bwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/bmm_chunk_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/bmm_optimized_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/cache_transform_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/chunk_bwd_dqkg_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/chunk_cumsum_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/chunk_cumsum_vector_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/chunk_delta_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/chunk_gate_recurrence_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/chunk_gated_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/chunk_gla_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/chunk_gla_simple_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/chunk_linear_attn_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/chunk_retention_ops_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/chunk_retention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/chunked_cumsum_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/context_attn_bloom_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/context_attn_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/context_attn_llama_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/context_attn_mistral_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/context_attn_nopad_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/cosine_compute_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/cross_entropy1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/cross_entropy2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/cross_entropy_ops_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/decay_cumsum_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/dequantize_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/dequantize_rowwise_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/destindex_copy_kv1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/destindex_copy_kv2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/destindex_copy_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/diag_ssm_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/dropout_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/embedding_triton_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/f8_conversion_utils_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/fast_ce_loss_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/fast_layernorm_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/fast_rms_layernorm_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/fast_rope_embedding_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/fifth_order_sph_harmonics_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/flash_attn_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/flash_decode2_llama_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/flash_decode2_phi_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/fp4_to_bf16_conversion_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/fp4_to_bf16_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/fused_activation_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/fused_layernorm_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/fused_recurrent_delta_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/fused_recurrent_hgrn_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/fused_recurrent_retention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/fused_rotary_embedding_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/fused_rwkv6_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/geglu_tanh_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/index_select_bwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/index_select_cat_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/int4_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/int8_dequant_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/int8_matmul_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/int8_matmul_quantization_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/int8_quantization_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/int_scaled_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/isfinite_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/iv_dependent_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/kcache_copy_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/kldiv_compute_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/kldiv_ops_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/kldiv_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/ksoftmax_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/kv_cache_copy_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/kv_cache_filling_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/l2_norm_bwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/l2_norm_triton1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/l2_norm_triton2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/layer_norm_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/layer_norm_liger_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/layer_norm_ops_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/layer_norm_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/layer_norm_welfold_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/layernorm_fwd_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/lightning_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/llama_ff_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/log_softmax_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/logsumexp_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/lora_expand_gemv_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/masked_add_cuda_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/masked_select_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/matmul_dequant_int4_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/matmul_dequantize_int4_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/matmul_dequantize_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/matmul_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/matmul_leakyrelu_fp8_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/matmul_leakyrelu_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/matmul_persistent_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/matmul_tma_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/matmul_triton1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/matmul_triton2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/matmul_triton_autotune_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/matrix_reduction_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/matrix_transpose_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/matrix_vector_multip_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/max_reduction_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/mean_reduction_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/mixed_sparse_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/mul_exponent_compensator_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/multinomial_sampling_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/nested_loops_processing_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/parallel_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/parallel_retention_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/pow_scalar_tensor_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/quant_transpose_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/quantize_copy_kv_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/quantize_global_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/quantize_kv_copy_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/quantize_kv_transform_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/rbe_triton_transform_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/relu_strided_buffer_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/relu_triton_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/reversed_cumsum_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/reversed_cumsum_scalar_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/rms_matmul_rbe_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/rms_norm_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/rms_rbe_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/rmsnorm_fused_llama_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/rmsnorm_fused_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/rmsnorm_implementation_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/rmsnorm_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/rope_backward_transform_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/rope_embedding_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/rope_transform_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/rotary_emb_nopad_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/rotary_emb_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/rotary_transform_ops_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/rotary_transform_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/rowwise_quantization_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/seeded_dropout_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/sgmv_expand_slice_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/sin_computation_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/sin_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/softmax_flaggems_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/softmax_optimize_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/softmax_reducev_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/softmax_triton1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/softmax_triton2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/softmax_triton3_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/spinning_lock_reduction_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/square_matrix_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/streamk_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/swiglu_backward_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/swiglu_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/swiglu_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/token_attn_llama2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/token_attn_mistral_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/token_attn_reduceV_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/token_softmax_bloom_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/token_softmax_llama_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/triton_argmax_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/triton_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/triton_conv2d_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/triton_linear_activation_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/triton_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/triton_mul2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/triton_softmax_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/uniform_sampling_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/var_len_copy_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/vector_addition_custom_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_metrics_refactor/vector_addition_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/adam_update_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/add_example.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/add_value.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/apply_penalty.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/attention_forward_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/attention_fwd_triton1.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/attention_fwd_triton2.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/attention_fwd_triton3.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/attention_kernel.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/attention_kernel_aligned.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/attention_llama.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/attention_score.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/attn_fwd_causal.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/attn_fwd_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/batched_vecmat_mult.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/bgmv_expand_slice.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/bgmv_shrink_kernel.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/block_sparse_attn.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/bmm_chunk_bwd.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/bmm_chunk_fwd.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/bmm_optimized.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/cache_transform_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/chunk_bwd_dqkg.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/chunk_cumsum_kernel.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/chunk_cumsum_vector.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/chunk_delta_fwd.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/chunk_gate_recurrence.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/chunk_gated_attention.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/chunk_gla_fwd.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/chunk_gla_simple.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/chunk_linear_attn.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/chunk_retention.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/chunk_retention_ops.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/chunked_cumsum_fwd.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/context_attn_bloom.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/context_attn_fwd.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/context_attn_llama.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/context_attn_mistral.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/context_attn_nopad.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/cosine_compute.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/cross_entropy1.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/cross_entropy2.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/cross_entropy_ops.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/decay_cumsum.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/dequantize_matmul.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/dequantize_rowwise.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/destindex_copy.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/destindex_copy_kv1.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/destindex_copy_kv2.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/diag_ssm_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/dropout_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/embedding_triton_kernel.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/f8_conversion_utils.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/fast_ce_loss.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/fast_layernorm.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/fast_rms_layernorm.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/fast_rope_embedding.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/fifth_order_sph_harmonics.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/flash_attn.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/flash_decode2_llama.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/flash_decode2_phi.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/fp4_to_bf16.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/fp4_to_bf16_conversion.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/fused_activation.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/fused_layernorm_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/fused_recurrent_delta.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/fused_recurrent_hgrn.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/fused_recurrent_retention.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/fused_rotary_embedding.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/fused_rwkv6_kernel.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/geglu_tanh_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/index_select_bwd.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/index_select_cat.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/int4_matmul.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/int8_dequant_matmul.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/int8_matmul_kernel.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/int8_matmul_quantization.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/int8_quantization.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/int_scaled_matmul.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/isfinite_kernel.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/iv_dependent_matmul.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/kcache_copy_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/kldiv_compute.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/kldiv_ops.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/kldiv_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/ksoftmax_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/kv_cache_copy.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/kv_cache_filling.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/l2_norm_bwd.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/l2_norm_triton1.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/l2_norm_triton2.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/layer_norm_fwd.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/layer_norm_liger.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/layer_norm_ops.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/layer_norm_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/layer_norm_welfold.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/layernorm_fwd_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/lightning_attention.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/llama_ff_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/log_softmax.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/logsumexp_fwd.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/lora_expand_gemv.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/masked_add_cuda.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/masked_select.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/matmul_dequant_int4.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/matmul_dequantize.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/matmul_dequantize_int4.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/matmul_kernel.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/matmul_leakyrelu.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/matmul_leakyrelu_fp8.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/matmul_persistent_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/matmul_tma.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/matmul_triton1.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/matmul_triton2.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/matmul_triton_autotune.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/matrix_reduction.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/matrix_transpose.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/matrix_vector_multip.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/max_reduction.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/mean_reduction.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/mixed_sparse_attention.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/mul_exponent_compensator.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/multinomial_sampling.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/nested_loops_processing.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/parallel_attention.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/parallel_retention_attention.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/pow_scalar_tensor.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/quant_transpose_kernel.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/quantize_copy_kv.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/quantize_global.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/quantize_kv_copy.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/quantize_kv_transform.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/rbe_triton_transform.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/relu_strided_buffer.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/relu_triton_kernel.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/reversed_cumsum.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/reversed_cumsum_scalar.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/rms_matmul_rbe.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/rms_norm_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/rms_rbe_matmul.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/rmsnorm_fused.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/rmsnorm_fused_llama.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/rmsnorm_implementation.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/rmsnorm_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/rope_backward_transform.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/rope_embedding.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/rope_transform.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/rotary_emb.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/rotary_emb_nopad.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/rotary_transform.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/rotary_transform_ops.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/rowwise_quantization_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/seeded_dropout.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/sgmv_expand_slice.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/sin_computation.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/sin_kernel.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/softmax_flaggems.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/softmax_optimize.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/softmax_reducev.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/softmax_triton1.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/softmax_triton2.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/softmax_triton3.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/spinning_lock_reduction.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/square_matrix.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/streamk_matmul.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/swiglu_backward.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/swiglu_fwd.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/swiglu_triton.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/token_attn_llama2.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/token_attn_mistral.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/token_attn_reduceV.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/token_softmax_bloom.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/token_softmax_llama.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/triton_argmax.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/triton_attention.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/triton_conv2d_fwd.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/triton_linear_activation.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/triton_matmul.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/triton_mul2.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/triton_softmax.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/uniform_sampling.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/var_len_copy.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/vector_addition.json tb_eval/data/TritonBench/performance_metrics/perf_G/golden_results/vector_addition_custom.json tb_eval/data/TritonBench/performance_metrics/perf_G/run_bench/multiprocess_gpu_run.py tb_eval/data/TritonBench/performance_metrics/perf_G/run_bench/write_file.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/__init__.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/adam_update_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/adam_update_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/add_example.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/add_example_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/add_value.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/add_value_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/apply_penalty.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/apply_penalty_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attention_forward_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attention_forward_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attention_fwd_triton1.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attention_fwd_triton1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attention_fwd_triton2.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attention_fwd_triton2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attention_fwd_triton3.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attention_fwd_triton3_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attention_kernel.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attention_kernel_aligned.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attention_kernel_aligned_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attention_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attention_llama.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attention_llama_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attention_score.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attention_score_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attn_fwd_causal.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attn_fwd_causal_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attn_fwd_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/attn_fwd_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/batched_vecmat_mult.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/batched_vecmat_mult_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/bgmv_expand_slice.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/bgmv_expand_slice_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/bgmv_shrink_kernel.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/bgmv_shrink_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/block_sparse_attn.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/block_sparse_attn_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/bmm_chunk_bwd.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/bmm_chunk_bwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/bmm_chunk_fwd.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/bmm_chunk_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/bmm_optimized.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/bmm_optimized_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/cache_transform_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/cache_transform_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_bwd_dqkg.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_bwd_dqkg_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_cumsum_kernel.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_cumsum_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_cumsum_vector.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_cumsum_vector_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_delta_fwd.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_delta_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_gate_recurrence.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_gate_recurrence_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_gated_attention.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_gated_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_gla_fwd.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_gla_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_gla_simple.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_gla_simple_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_linear_attn.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_linear_attn_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_retention.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_retention_ops.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_retention_ops_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunk_retention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunked_cumsum_fwd.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/chunked_cumsum_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/context_attn_bloom.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/context_attn_bloom_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/context_attn_fwd.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/context_attn_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/context_attn_llama.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/context_attn_llama_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/context_attn_mistral.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/context_attn_mistral_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/context_attn_nopad.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/context_attn_nopad_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/cosine_compute.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/cosine_compute_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/cross_entropy1.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/cross_entropy1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/cross_entropy2.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/cross_entropy2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/cross_entropy_ops.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/cross_entropy_ops_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/decay_cumsum.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/decay_cumsum_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/dequantize_matmul.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/dequantize_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/dequantize_rowwise.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/dequantize_rowwise_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/destindex_copy.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/destindex_copy_kv1.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/destindex_copy_kv1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/destindex_copy_kv2.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/destindex_copy_kv2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/destindex_copy_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/diag_ssm_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/diag_ssm_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/dropout_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/dropout_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/embedding_triton_kernel.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/embedding_triton_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/f8_conversion_utils.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/f8_conversion_utils_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fast_ce_loss.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fast_ce_loss_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fast_layernorm.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fast_layernorm_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fast_rms_layernorm.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fast_rms_layernorm_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fast_rope_embedding.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fast_rope_embedding_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fifth_order_sph_harmonics.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fifth_order_sph_harmonics_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/flash_attn.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/flash_attn_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/flash_decode2_llama.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/flash_decode2_llama_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/flash_decode2_phi.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/flash_decode2_phi_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fp4_to_bf16.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fp4_to_bf16_conversion.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fp4_to_bf16_conversion_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fp4_to_bf16_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fused_activation.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fused_activation_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fused_layernorm_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fused_layernorm_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fused_recurrent_delta.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fused_recurrent_delta_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fused_recurrent_hgrn.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fused_recurrent_hgrn_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fused_recurrent_retention.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fused_recurrent_retention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fused_rotary_embedding.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fused_rotary_embedding_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fused_rwkv6_kernel.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/fused_rwkv6_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/geglu_tanh_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/geglu_tanh_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/index_select_bwd.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/index_select_bwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/index_select_cat.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/index_select_cat_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/int4_matmul.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/int4_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/int8_dequant_matmul.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/int8_dequant_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/int8_matmul_kernel.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/int8_matmul_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/int8_matmul_quantization.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/int8_matmul_quantization_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/int8_quantization.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/int8_quantization_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/int_scaled_matmul.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/int_scaled_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/isfinite_kernel.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/isfinite_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/iv_dependent_matmul.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/iv_dependent_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/kcache_copy_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/kcache_copy_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/kldiv_compute.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/kldiv_compute_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/kldiv_ops.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/kldiv_ops_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/kldiv_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/kldiv_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/ksoftmax_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/ksoftmax_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/kv_cache_copy.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/kv_cache_copy_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/kv_cache_filling.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/kv_cache_filling_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/l2_norm_bwd.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/l2_norm_bwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/l2_norm_triton1.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/l2_norm_triton1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/l2_norm_triton2.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/l2_norm_triton2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/layer_norm_fwd.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/layer_norm_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/layer_norm_liger.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/layer_norm_liger_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/layer_norm_ops.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/layer_norm_ops_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/layer_norm_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/layer_norm_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/layer_norm_welfold.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/layer_norm_welfold_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/layernorm_fwd_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/layernorm_fwd_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/lightning_attention.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/lightning_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/llama_ff_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/llama_ff_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/log_softmax.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/log_softmax_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/logsumexp_fwd.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/logsumexp_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/lora_expand_gemv.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/lora_expand_gemv_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/masked_add_cuda.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/masked_add_cuda_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/masked_select.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/masked_select_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_dequant_int4.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_dequant_int4_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_dequantize.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_dequantize_int4.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_dequantize_int4_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_dequantize_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_kernel.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_leakyrelu.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_leakyrelu_fp8.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_leakyrelu_fp8_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_leakyrelu_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_persistent_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_persistent_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_tma.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_tma_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_triton1.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_triton1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_triton2.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_triton2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_triton_autotune.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matmul_triton_autotune_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matrix_reduction.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matrix_reduction_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matrix_transpose.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matrix_transpose_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matrix_vector_multip.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/matrix_vector_multip_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/max_reduction.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/max_reduction_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/mean_reduction.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/mean_reduction_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/mixed_sparse_attention.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/mixed_sparse_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/mul_exponent_compensator.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/mul_exponent_compensator_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/multinomial_sampling.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/multinomial_sampling_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/nested_loops_processing.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/nested_loops_processing_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/parallel_attention.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/parallel_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/parallel_retention_attention.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/parallel_retention_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/pow_scalar_tensor.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/pow_scalar_tensor_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/quant_transpose_kernel.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/quant_transpose_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/quantize_copy_kv.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/quantize_copy_kv_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/quantize_global.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/quantize_global_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/quantize_kv_copy.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/quantize_kv_copy_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/quantize_kv_transform.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/quantize_kv_transform_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rbe_triton_transform.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rbe_triton_transform_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/relu_strided_buffer.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/relu_strided_buffer_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/relu_triton_kernel.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/relu_triton_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/reversed_cumsum.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/reversed_cumsum_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/reversed_cumsum_scalar.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/reversed_cumsum_scalar_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rms_matmul_rbe.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rms_matmul_rbe_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rms_norm_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rms_norm_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rms_rbe_matmul.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rms_rbe_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rmsnorm_fused.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rmsnorm_fused_llama.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rmsnorm_fused_llama_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rmsnorm_fused_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rmsnorm_implementation.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rmsnorm_implementation_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rmsnorm_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rmsnorm_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rope_backward_transform.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rope_backward_transform_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rope_embedding.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rope_embedding_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rope_transform.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rope_transform_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rotary_emb.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rotary_emb_nopad.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rotary_emb_nopad_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rotary_emb_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rotary_transform.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rotary_transform_ops.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rotary_transform_ops_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rotary_transform_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rowwise_quantization_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/rowwise_quantization_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/seeded_dropout.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/seeded_dropout_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/sgmv_expand_slice.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/sgmv_expand_slice_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/sin_computation.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/sin_computation_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/sin_kernel.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/sin_kernel_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/softmax_flaggems.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/softmax_flaggems_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/softmax_optimize.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/softmax_optimize_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/softmax_reducev.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/softmax_reducev_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/softmax_triton1.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/softmax_triton1_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/softmax_triton2.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/softmax_triton2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/softmax_triton3.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/softmax_triton3_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/spinning_lock_reduction.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/spinning_lock_reduction_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/square_matrix.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/square_matrix_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/streamk_matmul.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/streamk_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/swiglu_backward.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/swiglu_backward_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/swiglu_fwd.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/swiglu_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/swiglu_triton.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/swiglu_triton_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/token_attn_llama2.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/token_attn_llama2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/token_attn_mistral.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/token_attn_mistral_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/token_attn_reduceV.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/token_attn_reduceV_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/token_softmax_bloom.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/token_softmax_bloom_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/token_softmax_llama.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/token_softmax_llama_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/triton_argmax.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/triton_argmax_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/triton_attention.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/triton_attention_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/triton_conv2d_fwd.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/triton_conv2d_fwd_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/triton_linear_activation.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/triton_linear_activation_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/triton_matmul.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/triton_matmul_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/triton_mul2.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/triton_mul2_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/triton_softmax.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/triton_softmax_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/uniform_sampling.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/uniform_sampling_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/var_len_copy.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/var_len_copy_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/vector_addition.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/vector_addition_custom.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/vector_addition_custom_perf.py tb_eval/data/TritonBench/performance_metrics/perf_G/test_golden_metrics/vector_addition_perf.py tb_eval/evaluators/ROCm_correctness.py tb_eval/evaluators/TB_correctness.py tb_eval/evaluators/__init__.py tb_eval/evaluators/base.py tb_eval/evaluators/interface.py tb_eval/helpers/__init__.py tb_eval/helpers/generators.py tb_eval/helpers/helper.py tb_eval/helpers/time.py tb_eval/metrics/__init__.py tb_eval/metrics/accuracy.py tb_eval/metrics/base.py tb_eval/metrics/passk.py tb_eval/perf/2_efficiency.py tb_eval/perf/base.py tb_eval/perf/efficiency.py tb_eval/perf/performance_utils.py tb_eval/perf/ROCm/efficiency.py tb_eval/perf/ROCm/performance_utils_pytest.py tb_eval/perf/run_bench/multiprocess_gpu_run.py tb_eval/perf/run_bench/performance_utils.py tb_eval/perf/run_bench/write_file.py tb_eval/processors/__init__.py tb_eval/processors/base.py tb_eval/processors/llm.py