Skip to content

Commit

Permalink
[test] downgrade tokenizer to 0.13 to fix unhandled ge.Scalar ATen IR …
Browse files Browse the repository at this point in the history
…node

Signed-off-by: Avimitin <[email protected]>
  • Loading branch information
Avimitin committed Aug 30, 2024
1 parent b04364f commit 48c5bb1
Show file tree
Hide file tree
Showing 2 changed files with 167 additions and 3 deletions.
24 changes: 21 additions & 3 deletions nix/pkgs/buddy-mlir.nix
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,32 @@
, ninja
, llvmPackages_17
, fetchFromGitHub
, fetchpatch
, fetchurl
, rustPlatform
, python3
, callPackage
}:
let
stdenv = llvmPackages_17.stdenv;
bintools = llvmPackages_17.bintools;

# python3 with two entries of its package set replaced by older releases:
# `tokenizers` comes from the local ./tokenizer-013.nix expression and
# `transformers` is rebuilt from the v4.33.1 source tag.
# NOTE(review): the nixpkgs manual's packageOverrides example also passes
# `self = <the overridden python>`; confirm whether the pinned nixpkgs needs it.
downgradedPyPkgs = python3.override {
  packageOverrides = final: prev: {
    # Resolved against the overridden set (`final`), so its dependencies are
    # taken from the same package set it ends up in.
    tokenizers = final.callPackage ./tokenizer-013.nix { };

    # Keep the existing transformers recipe; only swap version and source.
    transformers = prev.transformers.overridePythonAttrs (_:
      let
        version = "4.33.1";
      in
      {
        inherit version;

        src = fetchFromGitHub {
          owner = "huggingface";
          repo = "transformers";
          rev = "refs/tags/v${version}";
          hash = "sha256-Z78I9S8g9WexoX6HhxwbOD0K0awCTzsqW1ZiWObQNw0=";
        };
      });
  };
};

buddy-llvm = callPackage ./buddy-llvm.nix { inherit stdenv python3; };
self = stdenv.mkDerivation {
pname = "buddy-mlir";
Expand Down Expand Up @@ -49,12 +67,12 @@ let
llvm = buddy-llvm;

# Below three fields are black magic that allow site-packages automatically imported with nixpkgs hooks
pythonModule = python3;
pythonModule = downgradedPyPkgs;
pythonPath = [ ];
requiredPythonModules = [ ];

# nix run buddy-mlir.pyenv to start a python with PyTorch/LLVM MLIR/Buddy Frontend support
pyenv = python3.withPackages (ps: [
pyenv = downgradedPyPkgs.withPackages (ps: [
self
ps.torch

Expand Down
146 changes: 146 additions & 0 deletions nix/pkgs/tokenizer-013.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
{ lib
, buildPythonPackage
, datasets
, fetchFromGitHub
, fetchurl
, numpy
, openssl
, pkg-config
, pytestCheckHook
, pythonOlder
, requests
, rustPlatform
, setuptools-rust
}:

let
  # Thin wrapper over fetchurl: `pin` is the attribute set carrying the
  # integrity field (`sha256 = …` or `hash = …`) for the download at `url`.
  pinnedFile = url: pin: fetchurl (pin // { inherit url; });

  # Data files consumed by the Python test-suite. URLs and file names follow
  # https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py

  # RoBERTa
  robertaVocab = pinnedFile
    "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json"
    { sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy"; };
  robertaMerges = pinnedFile
    "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt"
    { sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w"; };

  # ALBERT / BERT
  albertVocab = pinnedFile
    "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json"
    { sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf"; };
  bertVocab = pinnedFile
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"
    { sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07"; };

  # Plain-text corpus
  norvigBig = pinnedFile
    "https://norvig.com/big.txt"
    { sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps"; };

  # Tokenizers referenced by the upstream documentation examples
  docPipelineTokenizer = pinnedFile
    "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json"
    { hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc="; };
  docQuicktourTokenizer = pinnedFile
    "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json"
    { hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI="; };

  # OpenAI GPT
  openaiVocab = pinnedFile
    "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"
    { sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x"; };
  openaiMerges = pinnedFile
    "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"
    { sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f"; };

  # Cargo.lock taken from a fixed nixpkgs revision of the tokenizers package;
  # it is both linked into the source tree and used to vendor the crates.
  cargoLock = pinnedFile
    "https://raw.githubusercontent.com/NixOS/nixpkgs/f2ca6788911a1a160089a0575bdfa4170c4e4f83/pkgs/development/python-modules/tokenizers/Cargo.lock"
    { hash = "sha256-KPU7eCrir9oemFhfsdmvOcRjaunQncpszwinRg0FQUE="; };
in
# Hugging Face `tokenizers` 0.13.3: Python bindings around a Rust crate.
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.13.3";

  disabled = pythonOlder "3.7";

  # --- source -------------------------------------------------------------
  src = fetchFromGitHub {
    owner = "huggingface";
    repo = pname;
    rev = "python-v${version}";
    hash = "sha256-QZG5jmr3vbyQs4mVBjwVDR31O66dUM+p39R0htJ1umk=";
  };

  # Only the Python bindings sub-directory of the repository is built.
  sourceRoot = "source/bindings/python";

  # Link every pre-fetched fixture under tests/data using the file name the
  # test-suite looks for.
  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access
    mkdir $sourceRoot/tests/data
    ( cd $sourceRoot/tests/data
    ln -s ${robertaVocab} roberta-base-vocab.json
    ln -s ${robertaMerges} roberta-base-merges.txt
    ln -s ${albertVocab} albert-base-v1-tokenizer.json
    ln -s ${bertVocab} bert-base-uncased-vocab.txt
    ln -s ${docPipelineTokenizer} bert-wiki.json
    ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
    ln -s ${norvigBig} big.txt
    ln -s ${openaiVocab} openai-gpt-vocab.json
    ln -s ${openaiMerges} openai-gpt-merges.txt )
  '';

  # Put the pinned lock file into the build tree; the same file drives
  # `cargoDeps` below.
  postPatch = ''
    ln -s ${cargoLock} Cargo.lock
  '';

  # --- rust ---------------------------------------------------------------
  cargoDeps = rustPlatform.importCargoLock { lockFile = cargoLock; };

  # Allow (-A) the `invalid_reference_casting` lint.
  # NOTE(review): presumably a newer rustc rejects this old source otherwise — confirm.
  RUSTFLAGS = "-A invalid_reference_casting";

  # --- dependencies -------------------------------------------------------
  nativeBuildInputs = [
    pkg-config
    setuptools-rust
    rustPlatform.cargoSetupHook
    rustPlatform.rust.cargo
    rustPlatform.rust.rustc
  ];

  buildInputs = [ openssl ];

  propagatedBuildInputs = [ numpy ];

  # --- checks -------------------------------------------------------------
  nativeCheckInputs = [
    datasets
    pytestCheckHook
    requests
  ];

  # Point HOME at a fresh temporary directory for the test run.
  preCheck = ''
    export HOME=$(mktemp -d);
  '';

  disabledTests = [
    # Downloads data using the datasets module
    "TestTrainFromIterators"
    # Those tests require more data
    "test_from_pretrained"
    "test_from_pretrained_revision"
    "test_continuing_prefix_trainer_mistmatch"
  ];

  pythonImportsCheck = [ "tokenizers" ];

  # --- metadata -----------------------------------------------------------
  meta = {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    license = lib.licenses.asl20;
    maintainers = [ ];
    platforms = lib.platforms.unix;
  };
}

0 comments on commit 48c5bb1

Please sign in to comment.