V0401 08:54:21.881000 140424060892160 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch/test/inductor/test_torchinductor.py", 0]}
V0401 08:54:21.881000 140424060892160 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch/torch/_inductor/test_case.py", 1]}
V0401 08:54:21.881000 140424060892160 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch/torch/_dynamo/test_case.py", 2]}
V0401 08:54:21.881000 140424060892160 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch/torch/testing/_internal/common_utils.py", 3]}
V0401 08:54:21.881000 140424060892160 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch-env/lib/python3.10/unittest/main.py", 4]}
V0401 08:54:21.881000 140424060892160 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch-env/lib/python3.10/unittest/runner.py", 5]}
V0401 08:54:21.881000 140424060892160 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch-env/lib/python3.10/unittest/suite.py", 6]}
V0401 08:54:21.881000 140424060892160 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch-env/lib/python3.10/unittest/case.py", 7]}
V0401 08:54:21.881000 140424060892160 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch-env/lib/python3.10/contextlib.py", 8]}
V0401 08:54:21.881000 140424060892160 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch/torch/nn/modules/module.py", 9]}
V0401 08:54:21.881000 140424060892160 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch/torch/_dynamo/eval_frame.py", 10]}
V0401 08:54:21.882000 140424060892160 torch/_dynamo/convert_frame.py:672] {"dynamo_start": {"stack": [{"line": 10031, "name": "<module>", "filename": 0}, {"line": 14, "name": "run_tests", "filename": 1}, {"line": 41, "name": "run_tests", "filename": 2}, {"line": 1165, "name": "run_tests", "filename": 3}, {"line": 101, "name": "__init__", "filename": 4}, {"line": 271, "name": "runTests", "filename": 4}, {"line": 184, "name": "run", "filename": 5}, {"line": 84, "name": "__call__", "filename": 6}, {"line": 122, "name": "run", "filename": 6}, {"line": 84, "name": "__call__", "filename": 6}, {"line": 122, "name": "run", "filename": 6}, {"line": 650, "name": "__call__", "filename": 7}, {"line": 2866, "name": "run", "filename": 3}, {"line": 2838, "name": "_run_custom", "filename": 3}, {"line": 591, "name": "run", "filename": 7}, {"line": 549, "name": "_callTestMethod", "filename": 7}, {"line": 2739, "name": "wrapper", "filename": 3}, {"line": 9214, "name": "new_test", "filename": 0}, {"line": 79, "name": "inner", "filename": 8}, {"line": 8845, "name": "test_custom_op_fixed_layout_channels_last", "filename": 0}, {"line": 1527, "name": "_wrapped_call_impl", "filename": 9}, {"line": 1536, "name": "_call_impl", "filename": 9}, {"line": 450, "name": "_fn", "filename": 10}, {"line": 1527, "name": "_wrapped_call_impl", "filename": 9}, {"line": 1536, "name": "_call_impl", "filename": 9}]}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
V0401 08:54:21.895000 140424060892160 torch/_dynamo/output_graph.py:1189] {"dynamo_output_graph": {"sizes": {"l_x_": [1, 320, 128, 128], "out": [1, 320, 128, 128], "out_2": [1, 320, 128, 128], "out_3": [1, 320, 128, 128]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "74cabbbada68afbad8a921c47aa2b317"}
class GraphModule(torch.nn.Module):
    def forward(self, L_x_ : torch.Tensor):
        l_x_ = L_x_

        # File: /data/users/jjwu/a/pytorch/test/inductor/test_torchinductor.py:8809 in helper, code: out = F.gelu(x)
        out = torch._C._nn.gelu(l_x_);  l_x_ = None

        # File: /data/users/jjwu/a/pytorch/test/inductor/test_torchinductor.py:8810 in helper, code: out = self.in_layers(out)
        out_2 = self.L__self___in_layers_0(out);  out = None

        # File: /data/users/jjwu/a/pytorch/test/inductor/test_torchinductor.py:8815 in forward, code: out = torch.ops.test.baz(out)
        out_3 = torch.ops.test.baz(out_2);  out_2 = None
        return (out_3,)

V0401 08:54:21.914000 140424060892160 torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:112] {"aot_forward_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "9d0885575d7f020cb3a1996185650901"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "f32[1, 320, 128, 128]"):
        # File: /data/users/jjwu/a/pytorch/test/inductor/test_torchinductor.py:8809 in helper, code: out = F.gelu(x)
        mul: "f32[1, 320, 128, 128]" = torch.ops.aten.mul.Tensor(arg0_1, 0.5)
        mul_1: "f32[1, 320, 128, 128]" = torch.ops.aten.mul.Tensor(arg0_1, 0.7071067811865476);  arg0_1 = None
        erf: "f32[1, 320, 128, 128]" = torch.ops.aten.erf.default(mul_1);  mul_1 = None
        add: "f32[1, 320, 128, 128]" = torch.ops.aten.add.Tensor(erf, 1);  erf = None
        mul_2: "f32[1, 320, 128, 128]" = torch.ops.aten.mul.Tensor(mul, add);  mul = add = None

        # File: /data/users/jjwu/a/pytorch/test/inductor/test_torchinductor.py:8810 in helper, code: out = self.in_layers(out)
        rand: "f32[1, 320, 128, 128]" = torch.ops.aten.rand.default([1, 320, 128, 128], dtype = torch.float32, device = device(type='cuda', index=0), pin_memory = False)
        convert_element_type: "f32[1, 320, 128, 128]" = torch.ops.prims.convert_element_type.default(rand, torch.float32);  rand = None
        clone: "f32[1, 320, 128, 128]" = torch.ops.aten.clone.default(convert_element_type, memory_format = torch.channels_last);  convert_element_type = None
        gt: "b8[1, 320, 128, 128]" = torch.ops.aten.gt.Scalar(clone, 0.1);  clone = None
        mul_3: "f32[1, 320, 128, 128]" = torch.ops.aten.mul.Tensor(gt, mul_2);  gt = mul_2 = None
        mul_4: "f32[1, 320, 128, 128]" = torch.ops.aten.mul.Tensor(mul_3, 1.1111111111111112);  mul_3 = None

        # File: /data/users/jjwu/a/pytorch/test/inductor/test_torchinductor.py:8815 in forward, code: out = torch.ops.test.baz(out)
        baz: "f32[1, 320, 128, 128]" = torch.ops.test.baz.default(mul_4);  mul_4 = None
        return (baz,)

V0401 08:54:22.062000 140424060892160 torch/_inductor/compile_fx.py:650] {"inductor_post_grad_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "cf729e565df6880d92548bb46eff9ab5"}
class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: "f32[1, 320, 128, 128]"):
        # File: /data/users/jjwu/a/pytorch/test/inductor/test_torchinductor.py:8809 in helper, code: out = F.gelu(x)
        mul: "f32[1, 320, 128, 128]" = torch.ops.aten.mul.Tensor(arg0_1, 0.5)
        mul_1: "f32[1, 320, 128, 128]" = torch.ops.aten.mul.Tensor(arg0_1, 0.7071067811865476);  arg0_1 = None
        erf: "f32[1, 320, 128, 128]" = torch.ops.aten.erf.default(mul_1);  mul_1 = None
        add: "f32[1, 320, 128, 128]" = torch.ops.aten.add.Tensor(erf, 1);  erf = None
        mul_2: "f32[1, 320, 128, 128]" = torch.ops.aten.mul.Tensor(mul, add);  mul = add = None

        # No stacktrace found for following nodes
        inductor_seeds_default: "i64[1]" = torch.ops.prims.inductor_seeds.default(1, device(type='cuda', index=0))
        inductor_lookup_seed_default: "i64[]" = torch.ops.prims.inductor_lookup_seed.default(inductor_seeds_default, 0);  inductor_seeds_default = None
        inductor_random_default: "f32[1, 320, 128, 128]" = torch.ops.prims.inductor_random.default([1, 320, 128, 128], inductor_lookup_seed_default, 'rand');  inductor_lookup_seed_default = None

        # File: /data/users/jjwu/a/pytorch/test/inductor/test_torchinductor.py:8810 in helper, code: out = self.in_layers(out)
        clone: "f32[1, 320, 128, 128]" = torch.ops.aten.clone.default(inductor_random_default, memory_format = torch.channels_last);  inductor_random_default = None
        gt: "b8[1, 320, 128, 128]" = torch.ops.aten.gt.Scalar(clone, 0.1);  clone = None
        mul_3: "f32[1, 320, 128, 128]" = torch.ops.aten.mul.Tensor(gt, mul_2);  gt = mul_2 = None
        mul_4: "f32[1, 320, 128, 128]" = torch.ops.aten.mul.Tensor(mul_3, 1.1111111111111112);  mul_3 = None

        # File: /data/users/jjwu/a/pytorch/test/inductor/test_torchinductor.py:8815 in forward, code: out = torch.ops.test.baz(out)
        baz: "f32[1, 320, 128, 128]" = torch.ops.test.baz.default(mul_4);  mul_4 = None
        return (baz,)

V0401 08:54:22.975000 140424060892160 torch/_inductor/graph.py:1268] {"inductor_output_code": {"filename": "/tmp/torchinductor_jjwu/pz/cpzf3cxhhnoarj4kjhg2wxhmski6yge4zox6h43vsrrdubhj7qnb.py"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "5dd8b94f459b07b5314187f78575d118"}
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align

from torch import device, empty_strided
from torch._inductor.codecache import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall

aten = torch.ops.aten
inductor_ops = torch.ops.inductor
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
alloc_from_pool = torch.ops.inductor._alloc_from_pool
reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
async_compile = AsyncCompile()


# kernel path: /tmp/torchinductor_jjwu/6p/c6pjjivx4yr7a6rhwljusernnkewuf53hvlnc2sy63rcnj764rhn.py
# Source Nodes: [], Original ATen: []
triton_poi_fused_0 = async_compile.triton('triton_', '''
import triton
import triton.language as tl
from triton.compiler.compiler import AttrsDescriptor

from torch._inductor import triton_helpers, triton_heuristics
from torch._inductor.ir import ReductionHint, TileHint
from torch._inductor.triton_helpers import libdevice, math as tl_math
from torch._inductor.triton_heuristics import AutotuneHint
from torch._inductor.utils import instance_descriptor

@triton_heuristics.pointwise(
    size_hints=[8388608],
    filename=__file__,
    triton_meta={'signature': {0: '*i64', 1: '*fp32', 2: 'i32', 3: 'i32'}, 'device': 0, 'device_type': 'cuda', 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 3), equal_to_1=(), divisible_by_8=(3,))]},
    inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_0', 'mutated_arg_names': [], 'no_x_dim': False, 'backend_hash': 'e24e28e8c74b85ff7b61b41fc9160c05d25c32556bda76a915743727cec50966'},
    min_elem_per_thread=0
)
@triton.jit
def triton_(in_ptr0, out_ptr0, load_seed_offset, xnumel, XBLOCK : tl.constexpr):
    xnumel = 5242880
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + load_seed_offset)
    tmp1 = x0
    tmp2 = tl.rand(tmp0, (tmp1).to(tl.uint32))
    tl.store(out_ptr0 + (x0), tmp2, None)
''', device_str='cuda')

import triton
import triton.language as tl
from torch._inductor.triton_heuristics import grid, split_scan_grid, start_graph, end_graph
from torch._C import _cuda_getCurrentRawStream as get_raw_stream


# kernel path: /tmp/torchinductor_jjwu/2g/c2gsdapii4jxiorppiwpmvslqswzpbmms26vhuqoh7nm27gxhdnf.py
# Source Nodes: [out, out_2, out_3], Original ATen: [aten.gelu, aten.native_dropout, test.baz]
# out => add, erf, mul, mul_1, mul_2
# out_2 => clone, gt, mul_3, mul_4
# out_3 => baz
triton_poi_fused_baz_gelu_native_dropout_1 = async_compile.triton('triton_', '''
import triton
import triton.language as tl
from triton.compiler.compiler import AttrsDescriptor

from torch._inductor import triton_helpers, triton_heuristics
from torch._inductor.ir import ReductionHint, TileHint
from torch._inductor.triton_helpers import libdevice, math as tl_math
from torch._inductor.triton_heuristics import AutotuneHint
from torch._inductor.utils import instance_descriptor

@triton_heuristics.pointwise(
    size_hints=[16384, 512], tile_hint=TileHint.DEFAULT,
    filename=__file__,
    triton_meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': 0, 'device_type': 'cuda', 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 2, 3, 4), equal_to_1=(), divisible_by_8=(3, 4))]},
    inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_baz_gelu_native_dropout_1', 'mutated_arg_names': [], 'no_x_dim': False, 'backend_hash': 'e24e28e8c74b85ff7b61b41fc9160c05d25c32556bda76a915743727cec50966'},
    min_elem_per_thread=0
)
@triton.jit
def triton_(in_ptr0, in_ptr1, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
    ynumel = 16384
    xnumel = 320
    yoffset = tl.program_id(1) * YBLOCK
    yindex = yoffset + tl.arange(0, YBLOCK)[None, :]
    ymask = yindex < ynumel
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    x1 = xindex
    y0 = yindex
    tmp0 = tl.load(in_ptr0 + (y0 + (16384*x1)), xmask, eviction_policy='evict_last')
    tmp4 = tl.load(in_ptr1 + (x1 + (320*y0)), xmask, eviction_policy='evict_last')
    tmp1 = 0.1
    tmp2 = tmp0 > tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp5 = 0.5
    tmp6 = tmp4 * tmp5
    tmp7 = 0.7071067811865476
    tmp8 = tmp4 * tmp7
    tmp9 = libdevice.erf(tmp8)
    tmp10 = 1.0
    tmp11 = tmp9 + tmp10
    tmp12 = tmp6 * tmp11
    tmp13 = tmp3 * tmp12
    tmp14 = 1.1111111111111112
    tmp15 = tmp13 * tmp14
    tl.store(out_ptr0 + (x1 + (320*y0)), tmp15, xmask)
''', device_str='cuda')

async_compile.wait(globals())
del async_compile


def call(args):
    arg0_1, = args
    args.clear()
    assert_size_stride(arg0_1, (1, 320, 128, 128), (5242880, 1, 40960, 320))
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
        buf0 = empty_strided_cuda((1, ), (1, ), torch.int64)
        # Source Nodes: [], Original ATen: []
        aten.randint.low_out(-9223372036854775808, 9223372036854775807, [1], out=buf0)
        buf1 = empty_strided_cuda((1, 320, 128, 128), (5242880, 16384, 128, 1), torch.float32)
        # Source Nodes: [], Original ATen: []
        stream0 = get_raw_stream(0)
        triton_poi_fused_0.run(buf0, buf1, 0, 5242880, grid=grid(5242880), stream=stream0)
        run_intermediate_hooks('inductor_random_default', buf1)
        del buf0
        buf2 = empty_strided_cuda((1, 320, 128, 128), (5242880, 1, 40960, 320), torch.float32)
        # Source Nodes: [out, out_2, out_3], Original ATen: [aten.gelu, aten.native_dropout, test.baz]
        triton_poi_fused_baz_gelu_native_dropout_1.run(buf1, arg0_1, buf2, 16384, 320, grid=grid(16384, 320), stream=stream0)
        run_intermediate_hooks('mul_4', buf2)
        del arg0_1
        del buf1
        # Source Nodes: [out, out_2, out_3], Original ATen: [aten.gelu, aten.native_dropout, test.baz]
        buf3 = torch.ops.test.baz.default(buf2)
        run_intermediate_hooks('baz', buf3)
        del buf2
        buf4 = buf3
        del buf3
    return (buf4, )


def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    arg0_1 = rand_strided((1, 320, 128, 128), (5242880, 1, 40960, 320), device='cuda:0', dtype=torch.float32)
    fn = lambda: call([arg0_1])
    return print_performance(fn, times=times, repeat=repeat)


if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('None', benchmark_compiled_module)

V0401 08:54:22.980000 140424060892160 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch/torch/_dynamo/convert_frame.py", 11]}
V0401 08:54:22.980000 140424060892160 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch/torch/_dynamo/utils.py", 12]}
V0401 08:54:22.980000 140424060892160 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch/torch/_dynamo/bytecode_transformation.py", 13]}
V0401 08:54:22.980000 140424060892160 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch/torch/_dynamo/symbolic_convert.py", 14]}
V0401 08:54:22.980000 140424060892160 torch/_logging/structured.py:19] {"str": ["/data/users/jjwu/a/pytorch/torch/_dynamo/output_graph.py", 15]}
V0401 08:54:22.980000 140424060892160 torch/_dynamo/guards.py:1194] {"dynamo_guards": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "2fd53e7b434e73e7bbdc496799a81b21"}
[
  {"code": "hasattr(L['x'], '_dynamo_dynamic_indices') == False", "stack": null, "user_stack": null},
  {"code": "___check_obj_id(L['self'], 140423721840032)", "stack": null, "user_stack": null},
  {"code": "___check_obj_id(L['self'].training, 7665376)", "stack": null, "user_stack": null},
  {"code": "utils_device.CURRENT_DEVICE == None", "stack": [{"line": 10031, "name": "<module>", "filename": 0}, {"line": 14, "name": "run_tests", "filename": 1}, {"line": 41, "name": "run_tests", "filename": 2}, {"line": 1165, "name": "run_tests", "filename": 3}, {"line": 101, "name": "__init__", "filename": 4}, {"line": 271, "name": "runTests", "filename": 4}, {"line": 184, "name": "run", "filename": 5}, {"line": 84, "name": "__call__", "filename": 6}, {"line": 122, "name": "run", "filename": 6}, {"line": 84, "name": "__call__", "filename": 6}, {"line": 122, "name": "run", "filename": 6}, {"line": 650, "name": "__call__", "filename": 7}, {"line": 2866, "name": "run", "filename": 3}, {"line": 2838, "name": "_run_custom", "filename": 3}, {"line": 591, "name": "run", "filename": 7}, {"line": 549, "name": "_callTestMethod", "filename": 7}, {"line": 2739, "name": "wrapper", "filename": 3}, {"line": 9214, "name": "new_test", "filename": 0}, {"line": 79, "name": "inner", "filename": 8}, {"line": 8845, "name": "test_custom_op_fixed_layout_channels_last", "filename": 0}, {"line": 1527, "name": "_wrapped_call_impl", "filename": 9}, {"line": 1536, "name": "_call_impl", "filename": 9}, {"line": 450, "name": "_fn", "filename": 10}, {"line": 1527, "name": "_wrapped_call_impl", "filename": 9}, {"line": 1536, "name": "_call_impl", "filename": 9}, {"line": 939, "name": "catch_errors", "filename": 11}, {"line": 802, "name": "_convert_frame", "filename": 11}, {"line": 400, "name": "_convert_frame_assert", "filename": 11}, {"line": 79, "name": "inner", "filename": 8}, {"line": 686, "name": "_compile", "filename": 11}, {"line": 262, "name": "time_wrapper", "filename": 12}, {"line": 541, "name": "compile_inner", "filename": 11}, {"line": 1036, "name": "transform_code_object", "filename": 13}, {"line": 165, "name": "_fn", "filename": 11}, {"line": 485, "name": "transform", "filename": 11}, {"line": 2105, "name": "__init__", "filename": 14}, {"line": 344, "name": "__init__", "filename": 15}, {"line": 467, "name": "init_ambient_guards", "filename": 15}], "user_stack": null},
  {"code": "___check_current_backend(140423721840560)", "stack": [{"line": 10031, "name": "<module>", "filename": 0}, {"line": 14, "name": "run_tests", "filename": 1}, {"line": 41, "name": "run_tests", "filename": 2}, {"line": 1165, "name": "run_tests", "filename": 3}, {"line": 101, "name": "__init__", "filename": 4}, {"line": 271, "name": "runTests", "filename": 4}, {"line": 184, "name": "run", "filename": 5}, {"line": 84, "name": "__call__", "filename": 6}, {"line": 122, "name": "run", "filename": 6}, {"line": 84, "name": "__call__", "filename": 6}, {"line": 122, "name": "run", "filename": 6}, {"line": 650, "name": "__call__", "filename": 7}, {"line": 2866, "name": "run", "filename": 3}, {"line": 2838, "name": "_run_custom", "filename": 3}, {"line": 591, "name": "run", "filename": 7}, {"line": 549, "name": "_callTestMethod", "filename": 7}, {"line": 2739, "name": "wrapper", "filename": 3}, {"line": 9214, "name": "new_test", "filename": 0}, {"line": 79, "name": "inner", "filename": 8}, {"line": 8845, "name": "test_custom_op_fixed_layout_channels_last", "filename": 0}, {"line": 1527, "name": "_wrapped_call_impl", "filename": 9}, {"line": 1536, "name": "_call_impl", "filename": 9}, {"line": 450, "name": "_fn", "filename": 10}, {"line": 1527, "name": "_wrapped_call_impl", "filename": 9}, {"line": 1536, "name": "_call_impl", "filename": 9}, {"line": 939, "name": "catch_errors", "filename": 11}, {"line": 802, "name": "_convert_frame", "filename": 11}, {"line": 400, "name": "_convert_frame_assert", "filename": 11}, {"line": 79, "name": "inner", "filename": 8}, {"line": 686, "name": "_compile", "filename": 11}, {"line": 262, "name": "time_wrapper", "filename": 12}, {"line": 541, "name": "compile_inner", "filename": 11}, {"line": 1036, "name": "transform_code_object", "filename": 13}, {"line": 165, "name": "_fn", "filename": 11}, {"line": 485, "name": "transform", "filename": 11}, {"line": 2105, "name": "__init__", "filename": 14}, {"line": 344, "name": "__init__", "filename": 15}, {"line": 473, "name": "init_ambient_guards", "filename": 15}], "user_stack": null},
  {"code": "check_tensor(L['x'], Tensor, DispatchKeySet(CUDA, BackendSelect, ADInplaceOrView, AutogradCUDA), torch.float32, device=0, requires_grad=False, size=[1, 320, 128, 128], stride=[5242880, 1, 40960, 320])", "stack": null, "user_stack": null}
]
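
For reference, the source lines quoted in the graph dumps above (test_torchinductor.py:8809, 8810, and 8815) suggest a module shaped roughly like the following. This is a hypothetical reconstruction, not the actual test source: the class name, the Dropout-only in_layers, and the baz stand-in body are assumptions. Only the quoted statements, the dropout probability (aten.gt.Scalar(_, 0.1) plus the 1/0.9 rescale) and the channels_last input (stride [5242880, 1, 40960, 320] in the check_tensor guard) are taken from the log.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.library import Library, impl

# Hypothetical registration for torch.ops.test.baz; the real test registers a
# custom op that requires a fixed (channels_last) input layout.
lib = Library("test", "FRAGMENT")
lib.define("baz(Tensor x) -> Tensor")

@impl(lib, "baz", "CompositeExplicitAutograd")
def baz(x):
    return x.clone()  # stand-in body, assumption

class Block(nn.Module):  # hypothetical name
    def __init__(self):
        super().__init__()
        # Dropout(p=0.1) matches gt(_, 0.1) and the 1.1111111111111112 rescale
        self.in_layers = nn.Sequential(nn.Dropout(p=0.1))

    def helper(self, x):
        out = F.gelu(x)            # decomposed into mul/erf/add/mul in the AOT graph
        out = self.in_layers(out)  # decomposed into rand/gt/mul/mul
        return out

    def forward(self, x):
        out = self.helper(x)
        out = torch.ops.test.baz(out)  # stays a single opaque node through Inductor
        return out

# Requires a CUDA device, matching the trace above.
m = torch.compile(Block().cuda())
x = torch.randn(1, 320, 128, 128, device="cuda").to(memory_format=torch.channels_last)
m(x)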
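
Records in this format come from PyTorch's structured-trace machinery rather than the TORCH_LOGS text logs. A minimal sketch of capturing and rendering such a trace, assuming a recent PyTorch (2.2+) and the separately installed tlparse CLI (pip install tlparse); the paths and the toy function below are arbitrary placeholders:

# Run as:   TORCH_TRACE=/tmp/my_trace python repro.py
# Then:     tlparse /tmp/my_trace/<generated log file>
# tlparse renders an HTML report with one page per dump kind seen above
# (dynamo_output_graph, aot_forward_graph, inductor_post_grad_graph,
# inductor_output_code, dynamo_guards, ...). TORCH_TRACE must be set in the
# environment before the process starts, hence the shell invocation.
import torch

def f(x):
    return torch.nn.functional.gelu(x)

compiled = torch.compile(f)
compiled(torch.randn(1, 320, 128, 128))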