Skip to content

Latest commit

 

History

History
 
 

embedding

Folders and files

NameName
Last commit message
Last commit date

parent directory

..
 
 
 
 
 
 
 
 

embedding

0x00 说明

包含以下内容:

  • embedding_f32_kernel
  • embedding_f32x4_kernel(float4向量化版本)
  • embedding_f32x4_pack_kernel(float4向量化,pack版本)
  • embedding_f16_kernel(fp16版本)
  • embedding_f16x8_kernel(fp16向量化版本)
  • embedding_f16x8_pack_kernel(fp16向量化,pack版本)
  • PyTorch bindings

测试

# 只测试Ada架构;若不指定,默认会编译所有架构(Volta, Ampere, Ada, Hopper, ...),耗时较长
export TORCH_CUDA_ARCH_LIST=Ada 
python3 embedding.py

embedding 是一个 elementwise(逐元素)的操作。但有一个值得探究的现象:在 f16 下,pack 版本的性能优于未 pack 的向量化版本;而在 f32 下则相反,pack 版本在多数配置下略慢于未 pack 的向量化版本。输出:

--------------------------------------------------------------------------------------------------------------
                                             MaxV=1024, SeqLen=2048, EmbSize=512
                out_f32: ['0.69075936  ', '0.56517494  ', '-0.12546943 '], time:0.005317ms
              out_f32x4: ['0.69075936  ', '0.56517494  ', '-0.12546943 '], time:0.004125ms
         out_f32x4_pack: ['0.69075936  ', '0.56517494  ', '-0.12546943 '], time:0.004017ms
             out_f32_th: ['0.69075936  ', '0.56517494  ', '-0.12546943 '], time:0.012147ms
--------------------------------------------------------------------------------------------------------------
                out_f16: ['-1.27734375 ', '-0.92822266 ', '-1.4453125  '], time:0.005090ms
              out_f16x8: ['-1.27734375 ', '-0.92822266 ', '-1.4453125  '], time:0.004089ms
         out_f16x8_pack: ['-1.27734375 ', '-0.92822266 ', '-1.4453125  '], time:0.004041ms
             out_f16_th: ['-1.27734375 ', '-0.92822266 ', '-1.4453125  '], time:0.011230ms
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
                                             MaxV=1024, SeqLen=2048, EmbSize=1024
                out_f32: ['-1.34922504 ', '-0.04674992 ', '1.24448562  '], time:0.011468ms
              out_f32x4: ['-1.34922504 ', '-0.04674992 ', '1.24448562  '], time:0.005364ms
         out_f32x4_pack: ['-1.34922504 ', '-0.04674992 ', '1.24448562  '], time:0.005448ms
             out_f32_th: ['-1.34922504 ', '-0.04674992 ', '1.24448562  '], time:0.037062ms
--------------------------------------------------------------------------------------------------------------
                out_f16: ['-0.47412109 ', '-0.47070312 ', '1.41894531  '], time:0.011039ms
              out_f16x8: ['-0.47412109 ', '-0.47070312 ', '1.41894531  '], time:0.004971ms
         out_f16x8_pack: ['-0.47412109 ', '-0.47070312 ', '1.41894531  '], time:0.004065ms
             out_f16_th: ['-0.47412109 ', '-0.47070312 ', '1.41894531  '], time:0.016463ms
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
                                             MaxV=1024, SeqLen=4096, EmbSize=512
                out_f32: ['0.30875409  ', '0.98055625  ', '-0.86661887 '], time:0.008464ms
              out_f32x4: ['0.30875409  ', '0.98055625  ', '-0.86661887 '], time:0.005400ms
         out_f32x4_pack: ['0.30875409  ', '0.98055625  ', '-0.86661887 '], time:0.005484ms
             out_f32_th: ['0.30875409  ', '0.98055625  ', '-0.86661887 '], time:0.016463ms
--------------------------------------------------------------------------------------------------------------
                out_f16: ['-1.04492188 ', '0.44750977  ', '-1.25878906 '], time:0.008070ms
              out_f16x8: ['-1.04492188 ', '0.44750977  ', '-1.25878906 '], time:0.005186ms
         out_f16x8_pack: ['-1.04492188 ', '0.44750977  ', '-1.25878906 '], time:0.004339ms
             out_f16_th: ['-1.04492188 ', '0.44750977  ', '-1.25878906 '], time:0.016415ms
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
                                             MaxV=1024, SeqLen=4096, EmbSize=1024
                out_f32: ['-0.07979927 ', '-0.20634571 ', '0.9166832   '], time:0.021017ms
              out_f32x4: ['-0.07979927 ', '-0.20634571 ', '0.9166832   '], time:0.008297ms
         out_f32x4_pack: ['-0.07979927 ', '-0.20634571 ', '0.9166832   '], time:0.008714ms
             out_f32_th: ['-0.07979927 ', '-0.20634571 ', '0.9166832   '], time:0.037730ms
--------------------------------------------------------------------------------------------------------------
                out_f16: ['-2.19726562 ', '0.50439453  ', '1.40917969  '], time:0.020158ms
              out_f16x8: ['-2.19726562 ', '0.50439453  ', '1.40917969  '], time:0.007451ms
         out_f16x8_pack: ['-2.19726562 ', '0.50439453  ', '1.40917969  '], time:0.005496ms
             out_f16_th: ['-2.19726562 ', '0.50439453  ', '1.40917969  '], time:0.030172ms
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
                                             MaxV=4096, SeqLen=2048, EmbSize=512
                out_f32: ['-0.13596091 ', '-0.07996719 ', '-2.77454519 '], time:0.005329ms
              out_f32x4: ['-0.13596091 ', '-0.07996719 ', '-2.77454519 '], time:0.004041ms
         out_f32x4_pack: ['-0.13596091 ', '-0.07996719 ', '-2.77454519 '], time:0.004005ms
             out_f32_th: ['-0.13596091 ', '-0.07996719 ', '-2.77454519 '], time:0.011110ms
--------------------------------------------------------------------------------------------------------------
                out_f16: ['0.14440918  ', '0.26367188  ', '0.77539062  '], time:0.005066ms
              out_f16x8: ['0.14440918  ', '0.26367188  ', '0.77539062  '], time:0.004041ms
         out_f16x8_pack: ['0.14440918  ', '0.26367188  ', '0.77539062  '], time:0.003982ms
             out_f16_th: ['0.14440918  ', '0.26367188  ', '0.77539062  '], time:0.011170ms
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
                                             MaxV=4096, SeqLen=2048, EmbSize=1024
                out_f32: ['1.26920044  ', '0.12124556  ', '0.38764721  '], time:0.011575ms
              out_f32x4: ['1.26920044  ', '0.12124556  ', '0.38764721  '], time:0.005329ms
         out_f32x4_pack: ['1.26920044  ', '0.12124556  ', '0.38764721  '], time:0.005519ms
             out_f32_th: ['1.26920044  ', '0.12124556  ', '0.38764721  '], time:0.016499ms
--------------------------------------------------------------------------------------------------------------
                out_f16: ['0.49243164  ', '-0.44116211 ', '-0.14611816 '], time:0.011182ms
              out_f16x8: ['0.49243164  ', '-0.44116211 ', '-0.14611816 '], time:0.004947ms
         out_f16x8_pack: ['0.49243164  ', '-0.44116211 ', '-0.14611816 '], time:0.004029ms
             out_f16_th: ['0.49243164  ', '-0.44116211 ', '-0.14611816 '], time:0.016606ms
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
                                             MaxV=4096, SeqLen=4096, EmbSize=512
                out_f32: ['-1.29970169 ', '-1.45127702 ', '-0.7259807  '], time:0.008452ms
              out_f32x4: ['-1.29970169 ', '-1.45127702 ', '-0.7259807  '], time:0.005424ms
         out_f32x4_pack: ['-1.29970169 ', '-1.45127702 ', '-0.7259807  '], time:0.005460ms
             out_f32_th: ['-1.29970169 ', '-1.45127702 ', '-0.7259807  '], time:0.016463ms
--------------------------------------------------------------------------------------------------------------
                out_f16: ['0.88623047  ', '0.00491714  ', '0.38525391  '], time:0.008094ms
              out_f16x8: ['0.88623047  ', '0.00491714  ', '0.38525391  '], time:0.005186ms
         out_f16x8_pack: ['0.88623047  ', '0.00491714  ', '0.38525391  '], time:0.004339ms
             out_f16_th: ['0.88623047  ', '0.00491714  ', '0.38525391  '], time:0.016451ms
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
                                             MaxV=4096, SeqLen=4096, EmbSize=1024
                out_f32: ['0.62899369  ', '0.55146962  ', '-0.03596229 '], time:0.021005ms
              out_f32x4: ['0.62899369  ', '0.55146962  ', '-0.03596229 '], time:0.008404ms
         out_f32x4_pack: ['0.62899369  ', '0.55146962  ', '-0.03596229 '], time:0.008738ms
             out_f32_th: ['0.62899369  ', '0.55146962  ', '-0.03596229 '], time:0.056100ms
--------------------------------------------------------------------------------------------------------------
                out_f16: ['1.13085938  ', '1.19628906  ', '-0.61035156 '], time:0.020254ms
              out_f16x8: ['1.13085938  ', '1.19628906  ', '-0.61035156 '], time:0.007451ms
         out_f16x8_pack: ['1.13085938  ', '1.19628906  ', '-0.61035156 '], time:0.005472ms
             out_f16_th: ['1.13085938  ', '1.19628906  ', '-0.61035156 '], time:0.030160ms
--------------------------------------------------------------------------------------------------------------