Skip to content

[Performance] Performance regression with int64 indices INDEX_DEFAULT_I64=ON (PR #6143) #6691

Description

@trevor-m

I've started noticing a large performance regression affecting Keras MobileNetV2 caused by INDEX_DEFAULT_I64=ON (PR #6143). This is on an AWS m5.12xlarge instance.

| INDEX_DEFAULT_I64 setting | Frames per second |
| ------------------------- | ----------------- |
| ON                        | 66.56             |
| OFF                       | 435.49            |

Profile with INDEX_DEFAULT_I64=OFF (fast)

Node Name                                            Ops                                                 Time(us)  Time(%)  Shape                 Inputs  Outputs  
---------                                            ---                                                 --------  -------  -----                 ------  -------  
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_7   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_7  64.704    3.571    (1, 9, 56, 56, 16)    3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_6   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_6  53.362    2.945    (1, 2, 112, 112, 16)  3       1        
fused_nn_pad_3                                       fused_nn_pad_3                                      50.582    2.791    (1, 6, 113, 113, 16)  1       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_5   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_5  47.874    2.642    (1, 6, 56, 56, 16)    3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_6             fused_nn_contrib_conv2d_NCHWc_add_clip_6            46.828    2.584    (1, 6, 112, 112, 16)  3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8  42.364    2.338    (1, 12, 28, 28, 16)   3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_91  fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9  39.554    2.183    (1, 36, 14, 14, 16)   3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_81  fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8  39.418    2.175    (1, 12, 28, 28, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_add_4              fused_nn_contrib_conv2d_NCHWc_add_add_4             38.871    2.145    (1, 2, 56, 56, 12)    4       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9  37.926    2.093    (1, 36, 14, 14, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_5             fused_nn_contrib_conv2d_NCHWc_add_clip_5            37.407    2.064    (1, 9, 56, 56, 16)    3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_51            fused_nn_contrib_conv2d_NCHWc_add_clip_5            35.349    1.951    (1, 9, 56, 56, 16)    3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip               fused_nn_contrib_conv2d_NCHWc_add_clip              34.692    1.915    (1, 80, 7, 7, 16)     3       1        
fused_nn_contrib_conv2d_NCHWc_add_6                  fused_nn_contrib_conv2d_NCHWc_add_6                 34.052    1.879    (1, 1, 112, 112, 16)  3       1        
fused_nn_contrib_conv2d_NCHWc_add                    fused_nn_contrib_conv2d_NCHWc_add                   33.58     1.853    (1, 20, 7, 7, 16)     3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_21  fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2  33.298    1.838    (1, 24, 14, 14, 16)   3       1        
fused_nn_pad_2                                       fused_nn_pad_2                                      33.201    1.832    (1, 9, 57, 57, 16)    1       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_22  fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2  33.057    1.824    (1, 24, 14, 14, 16)   3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2  33.027    1.823    (1, 24, 14, 14, 16)   3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_23  fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2  32.787    1.809    (1, 24, 14, 14, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_5                  fused_nn_contrib_conv2d_NCHWc_add_5                 32.332    1.784    (1, 2, 56, 56, 12)    3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip     fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip    32.156    1.775    (1, 60, 7, 7, 16)     3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip1    fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip    31.68     1.748    (1, 60, 7, 7, 16)     3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip2    fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip    30.832    1.701    (1, 60, 7, 7, 16)     3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_7             fused_nn_contrib_conv2d_NCHWc_add_clip_7            30.521    1.684    (1, 2, 112, 112, 16)  3       1        
fused_nn_contrib_conv2d_NCHWc_add_add_11             fused_nn_contrib_conv2d_NCHWc_add_add_1             30.012    1.656    (1, 6, 14, 14, 16)    4       1        
fused_nn_contrib_conv2d_NCHWc_add_add_1              fused_nn_contrib_conv2d_NCHWc_add_add_1             29.914    1.651    (1, 6, 14, 14, 16)    4       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_4   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_4  28.642    1.581    (1, 9, 28, 28, 16)    3       1        
fused_nn_global_avg_pool2d                           fused_nn_global_avg_pool2d                          28.552    1.576    (1, 80, 1, 1, 16)     1       1        
fused_layout_transform_40                            fused_layout_transform_40                           26.741    1.476    (1, 8, 56, 56, 12)    1       1        
fused_layout_transform_41                            fused_layout_transform_41                           25.793    1.423    (1, 12, 56, 56, 12)   1       1        
fused_nn_contrib_conv2d_NCHWc_add_add1               fused_nn_contrib_conv2d_NCHWc_add_add               25.759    1.422    (1, 10, 7, 7, 16)     4       1        
fused_nn_contrib_conv2d_NCHWc_add_add_2              fused_nn_contrib_conv2d_NCHWc_add_add_2             25.566    1.411    (1, 4, 14, 14, 16)    4       1        
fused_nn_dense_add                                   fused_nn_dense_add                                  25.52     1.408    (1, 1000)             3       1        
fused_nn_contrib_conv2d_NCHWc_add_add                fused_nn_contrib_conv2d_NCHWc_add_add               25.391    1.401    (1, 10, 7, 7, 16)     4       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_21            fused_nn_contrib_conv2d_NCHWc_add_clip_2            25.345    1.399    (1, 36, 14, 14, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_2             fused_nn_contrib_conv2d_NCHWc_add_clip_2            25.262    1.394    (1, 36, 14, 14, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_22            fused_nn_contrib_conv2d_NCHWc_add_clip_2            24.895    1.374    (1, 36, 14, 14, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_add_3              fused_nn_contrib_conv2d_NCHWc_add_add_3             24.679    1.362    (1, 2, 28, 28, 16)    4       1        
fused_nn_contrib_conv2d_NCHWc_add_add_31             fused_nn_contrib_conv2d_NCHWc_add_add_3             24.553    1.355    (1, 2, 28, 28, 16)    4       1        
fused_nn_contrib_conv2d_NCHWc_add_2                  fused_nn_contrib_conv2d_NCHWc_add_2                 23.364    1.289    (1, 6, 14, 14, 16)    3       1        
fused_nn_contrib_conv2d_NCHWc_add_add_21             fused_nn_contrib_conv2d_NCHWc_add_add_2             23.264    1.284    (1, 4, 14, 14, 16)    4       1        
fused_nn_contrib_conv2d_NCHWc_add_add_22             fused_nn_contrib_conv2d_NCHWc_add_add_2             23.006    1.27     (1, 4, 14, 14, 16)    4       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_11            fused_nn_contrib_conv2d_NCHWc_add_clip_1            22.724    1.254    (1, 60, 7, 7, 16)     3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_32            fused_nn_contrib_conv2d_NCHWc_add_clip_3            22.722    1.254    (1, 24, 14, 14, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_41            fused_nn_contrib_conv2d_NCHWc_add_clip_4            22.522    1.243    (1, 12, 28, 28, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_1             fused_nn_contrib_conv2d_NCHWc_add_clip_1            22.247    1.228    (1, 60, 7, 7, 16)     3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_33            fused_nn_contrib_conv2d_NCHWc_add_clip_3            21.648    1.195    (1, 24, 14, 14, 16)   3       1        
fused_nn_pad                                         fused_nn_pad                                        21.439    1.183    (1, 36, 15, 15, 16)   1       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_12            fused_nn_contrib_conv2d_NCHWc_add_clip_1            21.437    1.183    (1, 60, 7, 7, 16)     3       1        
fused_nn_contrib_conv2d_NCHWc_add_4                  fused_nn_contrib_conv2d_NCHWc_add_4                 21.426    1.182    (1, 2, 28, 28, 16)    3       1        
fused_nn_contrib_conv2d_NCHWc_add_1                  fused_nn_contrib_conv2d_NCHWc_add_1                 21.227    1.171    (1, 10, 7, 7, 16)     3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_31            fused_nn_contrib_conv2d_NCHWc_add_clip_3            20.739    1.145    (1, 24, 14, 14, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_3             fused_nn_contrib_conv2d_NCHWc_add_clip_3            20.719    1.143    (1, 24, 14, 14, 16)   3       1        
fused_nn_softmax                                     fused_nn_softmax                                    19.798    1.093    (1, 1000)             1       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_42            fused_nn_contrib_conv2d_NCHWc_add_clip_4            19.751    1.09     (1, 12, 28, 28, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_4             fused_nn_contrib_conv2d_NCHWc_add_clip_4            19.679    1.086    (1, 12, 28, 28, 16)   3       1        
fused_nn_pad_1                                       fused_nn_pad_1                                      18.729    1.034    (1, 12, 29, 29, 16)   1       1        
fused_nn_contrib_conv2d_NCHWc_add_3                  fused_nn_contrib_conv2d_NCHWc_add_3                 18.411    1.016    (1, 4, 14, 14, 16)    3       1        
fused_nn_pad_layout_transform                        fused_nn_pad_layout_transform                       18.159    1.002    (1, 1, 225, 225, 3)   1       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_3   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_3  15.938    0.88     (1, 12, 14, 14, 16)   3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_1   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_1  15.438    0.852    (1, 36, 7, 7, 16)     3       1        
fused_layout_transform_transpose_nn_batch_flatten    fused_layout_transform_transpose_nn_batch_flatten   1.563     0.086    (1, 1280)             1       1        
Total_time                                           -                                                   1812.033  -        -                     -       -        

Profile with INDEX_DEFAULT_I64=ON (slow)

Node Name                                            Ops                                                 Time(us)   Time(%)  Shape                 Inputs  Outputs  
---------                                            ---                                                 --------   -------  -----                 ------  -------  
fused_nn_contrib_conv2d_NCHWc_add_add_1              fused_nn_contrib_conv2d_NCHWc_add_add_1             3105.8     21.391   (1, 6, 14, 14, 16)    4       1        
fused_nn_contrib_conv2d_NCHWc_add_add_11             fused_nn_contrib_conv2d_NCHWc_add_add_1             3104.62    21.382   (1, 6, 14, 14, 16)    4       1        
fused_nn_contrib_conv2d_NCHWc_add_add_2              fused_nn_contrib_conv2d_NCHWc_add_add_2             2200.03    15.152   (1, 4, 14, 14, 16)    4       1        
fused_nn_contrib_conv2d_NCHWc_add_add_21             fused_nn_contrib_conv2d_NCHWc_add_add_2             2189.84    15.082   (1, 4, 14, 14, 16)    4       1        
fused_nn_contrib_conv2d_NCHWc_add_add_22             fused_nn_contrib_conv2d_NCHWc_add_add_2             2185.71    15.054   (1, 4, 14, 14, 16)    4       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_7   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_7  60.094     0.414    (1, 9, 56, 56, 16)    3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_91  fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9  52.82      0.364    (1, 36, 14, 14, 16)   3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_6   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_6  51.393     0.354    (1, 2, 112, 112, 16)  3       1        
fused_nn_pad_3                                       fused_nn_pad_3                                      51.19      0.353    (1, 6, 113, 113, 16)  1       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_5   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_5  49.058     0.338    (1, 6, 56, 56, 16)    3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_6             fused_nn_contrib_conv2d_NCHWc_add_clip_6            46.637     0.321    (1, 6, 112, 112, 16)  3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2  43.381     0.299    (1, 24, 14, 14, 16)   3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8  40.165     0.277    (1, 12, 28, 28, 16)   3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_23  fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2  39.355     0.271    (1, 24, 14, 14, 16)   3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_22  fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2  39.205     0.27     (1, 24, 14, 14, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_add_4              fused_nn_contrib_conv2d_NCHWc_add_add_4             38.595     0.266    (1, 2, 56, 56, 12)    4       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9  38.019     0.262    (1, 36, 14, 14, 16)   3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_81  fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8  37.559     0.259    (1, 12, 28, 28, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_5             fused_nn_contrib_conv2d_NCHWc_add_clip_5            36.159     0.249    (1, 9, 56, 56, 16)    3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_51            fused_nn_contrib_conv2d_NCHWc_add_clip_5            35.269     0.243    (1, 9, 56, 56, 16)    3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip               fused_nn_contrib_conv2d_NCHWc_add_clip              34.755     0.239    (1, 80, 7, 7, 16)     3       1        
fused_nn_contrib_conv2d_NCHWc_add_2                  fused_nn_contrib_conv2d_NCHWc_add_2                 34.248     0.236    (1, 6, 14, 14, 16)    3       1        
fused_nn_contrib_conv2d_NCHWc_add_6                  fused_nn_contrib_conv2d_NCHWc_add_6                 33.65      0.232    (1, 1, 112, 112, 16)  3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_7             fused_nn_contrib_conv2d_NCHWc_add_clip_7            33.163     0.228    (1, 2, 112, 112, 16)  3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_21  fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2  32.593     0.224    (1, 24, 14, 14, 16)   3       1        
fused_nn_pad_2                                       fused_nn_pad_2                                      32.542     0.224    (1, 9, 57, 57, 16)    1       1        
fused_nn_contrib_conv2d_NCHWc_add                    fused_nn_contrib_conv2d_NCHWc_add                   32.471     0.224    (1, 20, 7, 7, 16)     3       1        
fused_nn_contrib_conv2d_NCHWc_add_5                  fused_nn_contrib_conv2d_NCHWc_add_5                 31.587     0.218    (1, 2, 56, 56, 12)    3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip     fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip    30.659     0.211    (1, 60, 7, 7, 16)     3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip1    fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip    30.109     0.207    (1, 60, 7, 7, 16)     3       1        
fused_nn_pad                                         fused_nn_pad                                        29.258     0.202    (1, 36, 15, 15, 16)   1       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_4   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_4  29.083     0.2      (1, 9, 28, 28, 16)    3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_2             fused_nn_contrib_conv2d_NCHWc_add_clip_2            28.273     0.195    (1, 36, 14, 14, 16)   3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip2    fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip    28.052     0.193    (1, 60, 7, 7, 16)     3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_22            fused_nn_contrib_conv2d_NCHWc_add_clip_2            27.855     0.192    (1, 36, 14, 14, 16)   3       1        
fused_layout_transform_40                            fused_layout_transform_40                           27.811     0.192    (1, 8, 56, 56, 12)    1       1        
fused_nn_global_avg_pool2d                           fused_nn_global_avg_pool2d                          27.724     0.191    (1, 80, 1, 1, 16)     1       1        
fused_layout_transform_41                            fused_layout_transform_41                           27.308     0.188    (1, 12, 56, 56, 12)   1       1        
fused_nn_dense_add                                   fused_nn_dense_add                                  26.655     0.184    (1, 1000)             3       1        
fused_nn_contrib_conv2d_NCHWc_add_1                  fused_nn_contrib_conv2d_NCHWc_add_1                 26.406     0.182    (1, 10, 7, 7, 16)     3       1        
fused_nn_contrib_conv2d_NCHWc_add_add                fused_nn_contrib_conv2d_NCHWc_add_add               25.447     0.175    (1, 10, 7, 7, 16)     4       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_21            fused_nn_contrib_conv2d_NCHWc_add_clip_2            25.433     0.175    (1, 36, 14, 14, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_add1               fused_nn_contrib_conv2d_NCHWc_add_add               25.276     0.174    (1, 10, 7, 7, 16)     4       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_11            fused_nn_contrib_conv2d_NCHWc_add_clip_1            24.78      0.171    (1, 60, 7, 7, 16)     3       1        
fused_nn_contrib_conv2d_NCHWc_add_add_31             fused_nn_contrib_conv2d_NCHWc_add_add_3             24.132     0.166    (1, 2, 28, 28, 16)    4       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_12            fused_nn_contrib_conv2d_NCHWc_add_clip_1            23.359     0.161    (1, 60, 7, 7, 16)     3       1        
fused_nn_contrib_conv2d_NCHWc_add_add_3              fused_nn_contrib_conv2d_NCHWc_add_add_3             23.226     0.16     (1, 2, 28, 28, 16)    4       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_31            fused_nn_contrib_conv2d_NCHWc_add_clip_3            22.999     0.158    (1, 24, 14, 14, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_1             fused_nn_contrib_conv2d_NCHWc_add_clip_1            22.372     0.154    (1, 60, 7, 7, 16)     3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_41            fused_nn_contrib_conv2d_NCHWc_add_clip_4            21.948     0.151    (1, 12, 28, 28, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_4                  fused_nn_contrib_conv2d_NCHWc_add_4                 21.359     0.147    (1, 2, 28, 28, 16)    3       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_1   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_1  21.269     0.146    (1, 36, 7, 7, 16)     3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_33            fused_nn_contrib_conv2d_NCHWc_add_clip_3            20.916     0.144    (1, 24, 14, 14, 16)   3       1        
fused_nn_softmax                                     fused_nn_softmax                                    20.415     0.141    (1, 1000)             1       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_3             fused_nn_contrib_conv2d_NCHWc_add_clip_3            20.37      0.14     (1, 24, 14, 14, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_4             fused_nn_contrib_conv2d_NCHWc_add_clip_4            19.395     0.134    (1, 12, 28, 28, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_32            fused_nn_contrib_conv2d_NCHWc_add_clip_3            19.306     0.133    (1, 24, 14, 14, 16)   3       1        
fused_nn_pad_1                                       fused_nn_pad_1                                      19.284     0.133    (1, 12, 29, 29, 16)   1       1        
fused_nn_contrib_conv2d_NCHWc_add_clip_42            fused_nn_contrib_conv2d_NCHWc_add_clip_4            18.807     0.13     (1, 12, 28, 28, 16)   3       1        
fused_nn_contrib_conv2d_NCHWc_add_3                  fused_nn_contrib_conv2d_NCHWc_add_3                 17.728     0.122    (1, 4, 14, 14, 16)    3       1        
fused_nn_pad_layout_transform                        fused_nn_pad_layout_transform                       15.683     0.108    (1, 1, 225, 225, 3)   1       1        
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_3   fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_3  15.236     0.105    (1, 12, 14, 14, 16)   3       1        
fused_layout_transform_transpose_nn_batch_flatten    fused_layout_transform_transpose_nn_batch_flatten   1.607      0.011    (1, 1280)             1       1        
Total_time                                           -                                                   14519.449  -        -                     -       -        

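The slowdown comes almost entirely from these five fused conv2d + add + add ops, each of which takes roughly 23–30 us with INDEX_DEFAULT_I64=OFF but roughly 2,190–3,110 us with INDEX_DEFAULT_I64=ON: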

fused_nn_contrib_conv2d_NCHWc_add_add_1              fused_nn_contrib_conv2d_NCHWc_add_add_1             3105.8     21.391   (1, 6, 14, 14, 16)    4       1        
fused_nn_contrib_conv2d_NCHWc_add_add_11             fused_nn_contrib_conv2d_NCHWc_add_add_1             3104.62    21.382   (1, 6, 14, 14, 16)    4       1        
fused_nn_contrib_conv2d_NCHWc_add_add_2              fused_nn_contrib_conv2d_NCHWc_add_add_2             2200.03    15.152   (1, 4, 14, 14, 16)    4       1        
fused_nn_contrib_conv2d_NCHWc_add_add_21             fused_nn_contrib_conv2d_NCHWc_add_add_2             2189.84    15.082   (1, 4, 14, 14, 16)    4       1        
fused_nn_contrib_conv2d_NCHWc_add_add_22             fused_nn_contrib_conv2d_NCHWc_add_add_2             2185.71    15.054   (1, 4, 14, 14, 16)    4       1    

Here is a script to reproduce:

import time
import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_runtime
import tensorflow as tf
# Reproduction script: compile Keras MobileNetV2 with TVM for the
# "llvm -mcpu=skylake-avx512" target and report mean latency / throughput.
input_shape = (1, 3, 224, 224)
dtype = 'float32'
model = tf.keras.applications.MobileNetV2()
mod, params = relay.frontend.from_keras(model, shape={'input_1': input_shape})
with relay.build_config(opt_level=3):
    graph, lib, params = relay.build(mod, "llvm -mcpu=skylake-avx512", params=params)
i_data = np.random.uniform(0, 1, input_shape).astype(dtype)
# NOTE: `mod` is rebound here from the Relay module to the graph runtime module.
mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
mod.set_input(**params)
# Time
num_runs = 100
num_warmup = 10  # leading iterations excluded from the reported mean
times = []
for _ in range(num_runs):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short intervals; time.time() can jump if the system clock
    # is adjusted and may have coarser resolution.
    start_time = time.perf_counter()
    mod.run(input_1=i_data)
    # Fetching the output is deliberately kept inside the timed region.
    res = mod.get_output(0)
    times.append(time.perf_counter() - start_time)
# Per-iteration times are in seconds; the first `num_warmup` runs are dropped.
mean_time = np.mean(times[num_warmup:])
print('Mean latency:', 1000.0 * mean_time)
print('Mean FPS:', 1.0 / mean_time)

Thanks!

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions