Feature/cpu (#1019)
* add layernorm

* pass reduce

* add comment

* add layer norm test

* fix layernorm

* fix layernorm

* add demo2

* fix build / add view

* update layernorm

* support layernorm of llama

* fix build

* add demo2

* pass ym

* pass demo2

* fix onnx external data importer

* fuse MHA of llama

* add more cpu kernels

* update MHA fusion

* reorder MHA weights

* add demo3

* add demo3 compute stage 1

* fix build

* fix __tdma_all_sync_apply

* add to v35

* dump const

* update demo3 golden

* compiled

* support multiple output compare

* fix MHA kernel

* resplit v2

* fix MHA kernel

* to v26

* push

* fix double free

* fix mha kernel

* fix V35

* fix all

* support rmsnorm

* fix v22

* fix v22 v10

* pass v28

* pass v43

* remove dump

* add other part

* pass all llama65b decoder layers

* pass gather

* open 32 threads for demo4

* update binary/unary with external op

* fix code using stdlib

* update kernel inputs

* Fix gather

* refactor cpu cmodel

* update demo head

* pass graph to tir

* fix head main

* pass norm case

* update demo names

* add xpu source gen

* Fix head kernel segmentation fault

* fix cost evaluator

* Fix head kernel cos similarity

* decoder layer pass input layernorm

* Add unary demo

* pass v30 of decoder layer

* fix softmax

* Enable ImmOutput

* fix malloc

* remove debug macro

* Add ImmOut

* fix tdma store

* refactor cpu runtime

* refactor method table

* fix cpu test

* refactor auto distributed

* update cpu test

* fix rdata

* update cpu test with rdata

* fix typeinfer

* add XPU Op layernorm

* update layernorm cost

* fix cost evaluator

* fix layernorm

* add partial resplit

* add rvv matmul

* add codegen of cpu gather

* add concat/slice codegen

* merge

* add codegen of cpu softmax

* update slice cpu case

* fix slice

* fix cpu concat

* fix cpu concat

* Apply code-format changes

* fix build

* Apply code-format changes

* add codegen of transpose

* add reshape

* pass reshape2

* update stackvm

* merge

* fix build

* update compile

* fix to slicing

* fix negative axis

* fix matmul evaluator

* add NormAxis

* fix ToSlice

* fix matmul

* add GatherReduceScatter

* fix ToSlice

* refactor auto dist

* fix boxing partial to slice codegen

* softmax support split on axis

* add conv2d cpu kernel

* disable outer split on inner split axis

* fix binary distributedtype infer

* fix GetPartialCandidateNDSBPs

* pass cpu conv2d

* support dilated conv2d

* add mha pattern

* add combine reshape transpose

* fix mha fusion/ add rules

* fix rdata map dispose

* add xpu reduce arg

* Apply code-format changes

* add VAE fusion

* fuse VAE

* support xpu instance norm

* Apply code-format changes

* add reduce arg

* Apply code-format changes

* fix to tir keep vars order

* Apply code-format changes

* add XPU resize

* Apply code-format changes

* fix resize cpu kernel op

* fix Resize

* Update layernorm op for test

* Apply code-format changes

* fix conv2d kernel

* fix boxing with reshape

* fix build

* fix pytest compare

* add gelu kernels

* add xpu cast

* fix swish type infer

* support xpu expand

* Update layernorm rvv code

* fix binary broadcast with distributed broadcast

* support multi outputs

* fix single output

* fix new linked section

* fuse Unet

* add cos dump

* fix build

* speed up onnx external data load

* add typeinfer case for binary/matmul

* move matmul rvv to kernels

* fix conv2d kernel

* fix Unet Fusion

* optimize dynamic onnx

* change fusion counter

* fix conv2d if split is partial

* split conv to conv+bias+clamp, and add xpu clamp

* update fusion merger

* fix slice with negative axis

* llama-4-decoder pass (x86/rv64)

* text encoder/vae decoder pass (x86/rv64)

* fix conan config

* Fix cpu/test compile

* fix cmake config

* fix syntax err

* fix syntax err

* normalize axes of slice

* disable module cpu on windows

* do not split softmax on axis

* add softmax kernel test (see the softmax sketch after this list)

* add rvv instance norm

* clean modules dir

* add rvv clamp

* Clean modules dir

* fix match result

* Apply code-format changes

* Clean Tests

* fix csproj

* fix buffer schedule

* Add unet pytest

* add target's commands

* fix buffer and memspan hashcode and equals

* fix unittest

* fix unittest

* fix test_cli output dump

* fix command line

* fix type infer

* fix format

* fix all test

* Apply code-format changes

* fix merge

* Apply code-format changes

* fix merge

* fix runtime build

* fix kernel test build

* Apply code-format changes

* fix use mean

* Apply code-format changes

* optimize dot dump

* fix merge

* fix output when test cli
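
A quick aside on the softmax items above ("fix softmax", "add codegen of cpu softmax", "add softmax kernel test"): the point of such a kernel is a numerically stable softmax along one axis. The sketch below is an illustration only — the class and method names are hypothetical, not the nncase implementation.

```csharp
using System;

static class SoftmaxSketch
{
    // Numerically stable softmax over the last axis of a [rows, cols] buffer,
    // applied in place. Subtracting the row max keeps MathF.Exp from overflowing.
    public static void Softmax(float[] x, int rows, int cols)
    {
        for (int r = 0; r < rows; r++)
        {
            int off = r * cols;

            float max = float.NegativeInfinity;
            for (int c = 0; c < cols; c++)
                max = MathF.Max(max, x[off + c]);

            float sum = 0f;
            for (int c = 0; c < cols; c++)
            {
                x[off + c] = MathF.Exp(x[off + c] - max);
                sum += x[off + c];
            }

            for (int c = 0; c < cols; c++)
                x[off + c] /= sum;
        }
    }
}
```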

---------

Co-authored-by: xhuohai <[email protected]>
Co-authored-by: 郑启航 <[email protected]>
Co-authored-by: zhen8838 <[email protected]>
Co-authored-by: lerenhua <[email protected]>
Co-authored-by: liuzhiming <[email protected]>
Co-authored-by: liuzm6217-jianan <[email protected]>
7 people authored Nov 7, 2023
1 parent 21eccd2 commit 338ba10
Showing 243 changed files with 5,115 additions and 3,322 deletions.
4 changes: 2 additions & 2 deletions .gitignore
@@ -68,7 +68,7 @@ artifacts/
*.pidb
*.svclog
*.scc

*.bin
# Chutzpah Test files
_Chutzpah*

@@ -306,4 +306,4 @@ cmake-build-*
*gmodel_dump_dir*
*.ipynb_checkpoints*
# Auto generated files
# generated/
# generated/
3 changes: 1 addition & 2 deletions CMakeLists.txt
@@ -12,7 +12,6 @@ endif()

if(NOT DEFINED NNCASE_VERSION_SUFFIX)
find_package (Git)

execute_process(
COMMAND ${GIT_EXECUTABLE} describe --always --dirty --tag
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
@@ -274,5 +273,5 @@ if(BUILD_TESTING)
endif()

# Modules
#add_subdirectory(modules/k210)

#add_subdirectory(modules/vulkan)
5 changes: 3 additions & 2 deletions Directory.Packages.props
@@ -49,8 +49,9 @@
<PackageVersion Include="OrtKISharp" Version="0.0.2" />
<PackageVersion Include="RazorLight" Version="2.3.0" />
<PackageVersion Include="Singulink.Collections.Weak" Version="1.0.2" />
<PackageVersion Include="StyleCop.Analyzers" Version="1.2.0-beta.507" />
<PackageVersion Include="System.CommandLine.Hosting" Version="0.3.0-alpha.21216.1" />
<PackageVersion Include="StyleCop.Analyzers" Version="1.2.0-beta.435" />
<PackageVersion Include="System.CommandLine.Hosting" Version="0.4.0-alpha.22272.1" />
<PackageVersion Include="System.CommandLine" Version="2.0.0-beta4.22272.1" />
<PackageVersion Include="System.Linq.Async" Version="6.0.1" />
<PackageVersion Include="System.Reactive" Version="5.0.0" />
<PackageVersion Include="Tomlyn.Extensions.Configuration" Version="1.0.5" />
295 changes: 0 additions & 295 deletions modules/Nncase.Modules.CPU/packages.lock.json

This file was deleted.

@@ -1,6 +1,6 @@
// Copyright (c) Canaan Inc. All rights reserved.
// Licensed under the Apache license. See LICENSE file in the project root for full license information.
/* This file is generated by tools/stackvm_gen/IsaGen at 2023/9/18 5:04:31 PM +08:00. */
/* This file is generated by tools/stackvm_gen/IsaGen at 9/20/2023 10:17:08 AM +00:00. */

using System;
using System.Collections.Generic;
@@ -59,7 +59,7 @@ private void EmitTensorCall(Op op)
Emitter.T.L2Normalization();
break;
case IR.NN.LayerNorm top:
Emitter.T.LayerNorm(top.Axis, top.Epsilon);
Emitter.T.LayerNorm(top.Axis, top.Epsilon, top.UseMean);
break;
case IR.NN.LeakyRelu top:
Emitter.T.LeakyRelu();
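
The extra `top.UseMean` argument above connects to the "support rmsnorm" and "fix use mean" commits: with the mean term enabled this is standard LayerNorm, and with it disabled the op reduces to RMSNorm. A minimal sketch of that toggle — an illustration under that assumption, not the actual nncase kernel:

```csharp
using System;

static class LayerNormSketch
{
    // useMean = true  -> standard LayerNorm (subtract mean, divide by stddev).
    // useMean = false -> RMSNorm (skip the mean; divide by the root mean square).
    public static float[] Normalize(float[] x, float[] scale, float[] bias, float eps, bool useMean)
    {
        int n = x.Length;

        float mean = 0f;
        if (useMean)
        {
            for (int i = 0; i < n; i++)
                mean += x[i];
            mean /= n;
        }

        // Variance when useMean is true; mean of squares otherwise.
        float meanSq = 0f;
        for (int i = 0; i < n; i++)
        {
            float d = x[i] - mean;
            meanSq += d * d;
        }
        meanSq /= n;

        float inv = 1f / MathF.Sqrt(meanSq + eps);
        float[] y = new float[n];
        for (int i = 0; i < n; i++)
            y[i] = (x[i] - mean) * inv * scale[i] + bias[i];
        return y;
    }
}
```
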
@@ -176,7 +176,7 @@ private void EmitTensorCall(Op op)
Emitter.T.Cast(top.NewType, top.CastMode);
break;
case IR.Tensors.Concat top:
Emitter.T.Concat();
Emitter.T.Concat(top.Axis);
break;
case IR.Tensors.ConstantOfShape top:
Emitter.T.ConstantOfShape();
@@ -191,7 +191,7 @@ private void EmitTensorCall(Op op)
Emitter.T.Flatten();
break;
case IR.Tensors.Gather top:
Emitter.T.Gather();
Emitter.T.Gather(top.Axis);
break;
case IR.Tensors.GatherElements top:
Emitter.T.GatherElements();
@@ -205,9 +205,6 @@ private void EmitTensorCall(Op op)
case IR.Tensors.IndexOf top:
Emitter.T.IndexOf();
break;
case IR.Tensors.LSTM top:
Emitter.T.LSTM(top.Direction, top.Layout, top.Activations);
break;
case IR.Tensors.Prod top:
Emitter.T.Prod();
break;
@@ -289,6 +286,9 @@ private void EmitTensorCall(Op op)
case IR.ShapeExpr.UnsqueezeShape top:
Emitter.T.UnsqueezeShape();
break;
case IR.RNN.LSTM top:
Emitter.T.LSTM(top.Direction, top.Layout, top.Activations);
break;
case IR.Random.Normal top:
Emitter.T.Normal(top.Type);
break;
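`Concat` and `Gather` now receive their axis at emit time, and several commits ("fix negative axis", "add NormAxis", "normalize axes of slice") concern mapping ONNX-style negative axes into range. A hypothetical helper illustrating the idea — the real helper in nncase may differ in name and location:

```csharp
using System;

static class AxisSketch
{
    // Maps an axis that may be negative (counted from the end, ONNX-style)
    // into the canonical range [0, rank).
    public static int NormAxis(int axis, int rank)
    {
        if (axis < -rank || axis >= rank)
            throw new ArgumentOutOfRangeException(nameof(axis), $"axis {axis} is out of range for rank {rank}");
        return axis < 0 ? axis + rank : axis;
    }
}
```

For example, `NormAxis(-1, 4)` yields `3`, so a gather along the last axis of a rank-4 tensor lowers to the same kernel as an explicit axis 3.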
