From 6e9ea182b3770eefb17873b22090daa85c4bcce9 Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Thu, 30 Mar 2023 10:34:47 -0400 Subject: [PATCH 01/24] Adjust ci config for staging repo --- taskcluster/ci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taskcluster/ci/config.yml b/taskcluster/ci/config.yml index 5d33cdb8a..03b403e76 100644 --- a/taskcluster/ci/config.yml +++ b/taskcluster/ci/config.yml @@ -3,10 +3,10 @@ trust-domain: "translations" task-priority: low taskgraph: - cached-task-prefix: "translations.v2.firefox-translations-training" + cached-task-prefix: "translations.v2.staging-firefox-translations-training" repositories: firefox_translations_training: - name: "firefox-translations-training" + name: "staging-firefox-translations-training" workers: aliases: From 494f1589e66d2a2ea953b05368e35dbecaacf88d Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Fri, 31 Mar 2023 13:36:24 -0400 Subject: [PATCH 02/24] Use bhearsum's taskgraph repo for now It contains a couple of changes that have yet to be upstreamed to taskgraph. This also requires that we disable pip hash verification for the moment. 
--- .taskcluster.yml | 1 + taskcluster/requirements.in | 2 +- taskcluster/requirements.txt | 230 ++++++++--------------------------- 3 files changed, 52 insertions(+), 181 deletions(-) diff --git a/.taskcluster.yml b/.taskcluster.yml index 57a277c8b..f3ce55e26 100644 --- a/.taskcluster.yml +++ b/.taskcluster.yml @@ -183,6 +183,7 @@ tasks: ${normProjectUpper}_HEAD_REV: '${head_sha}' ${normProjectUpper}_REPOSITORY_TYPE: git ${normProjectUpper}_PIP_REQUIREMENTS: taskcluster/requirements.txt + PIP_DISABLE_REQUIRE_HASHES: "1" REPOSITORIES: $json: ${normProject}: ${normProject} diff --git a/taskcluster/requirements.in b/taskcluster/requirements.in index 83e95267e..94b80a7cb 100644 --- a/taskcluster/requirements.in +++ b/taskcluster/requirements.in @@ -1 +1 @@ -taskcluster-taskgraph>=4.2.0 +git+https://github.com/bhearsum/taskgraph@fetch-multiple-artifacts diff --git a/taskcluster/requirements.txt b/taskcluster/requirements.txt index 7f341876d..1ad679ac4 100644 --- a/taskcluster/requirements.txt +++ b/taskcluster/requirements.txt @@ -1,201 +1,71 @@ # -# This file is autogenerated by pip-compile with python 3.8 -# To update, run: +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: # -# pip-compile --generate-hashes requirements.in +# pip-compile requirements.in # -appdirs==1.4.4 \ - --hash=sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41 \ - --hash=sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128 +appdirs==1.4.4 # via taskcluster-taskgraph -attrs==22.2.0 \ - --hash=sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836 \ - --hash=sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99 +arrow==1.2.3 + # via jinja2-time +attrs==22.2.0 # via taskcluster-taskgraph -certifi==2022.12.7 \ - --hash=sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3 \ - --hash=sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18 
+binaryornot==0.4.4 + # via cookiecutter +certifi==2022.12.7 # via requests -charset-normalizer==3.0.1 \ - --hash=sha256:00d3ffdaafe92a5dc603cb9bd5111aaa36dfa187c8285c543be562e61b755f6b \ - --hash=sha256:024e606be3ed92216e2b6952ed859d86b4cfa52cd5bc5f050e7dc28f9b43ec42 \ - --hash=sha256:0298eafff88c99982a4cf66ba2efa1128e4ddaca0b05eec4c456bbc7db691d8d \ - --hash=sha256:02a51034802cbf38db3f89c66fb5d2ec57e6fe7ef2f4a44d070a593c3688667b \ - --hash=sha256:083c8d17153ecb403e5e1eb76a7ef4babfc2c48d58899c98fcaa04833e7a2f9a \ - --hash=sha256:0a11e971ed097d24c534c037d298ad32c6ce81a45736d31e0ff0ad37ab437d59 \ - --hash=sha256:0bf2dae5291758b6f84cf923bfaa285632816007db0330002fa1de38bfcb7154 \ - --hash=sha256:0c0a590235ccd933d9892c627dec5bc7511ce6ad6c1011fdf5b11363022746c1 \ - --hash=sha256:0f438ae3532723fb6ead77e7c604be7c8374094ef4ee2c5e03a3a17f1fca256c \ - --hash=sha256:109487860ef6a328f3eec66f2bf78b0b72400280d8f8ea05f69c51644ba6521a \ - --hash=sha256:11b53acf2411c3b09e6af37e4b9005cba376c872503c8f28218c7243582df45d \ - --hash=sha256:12db3b2c533c23ab812c2b25934f60383361f8a376ae272665f8e48b88e8e1c6 \ - --hash=sha256:14e76c0f23218b8f46c4d87018ca2e441535aed3632ca134b10239dfb6dadd6b \ - --hash=sha256:16a8663d6e281208d78806dbe14ee9903715361cf81f6d4309944e4d1e59ac5b \ - --hash=sha256:292d5e8ba896bbfd6334b096e34bffb56161c81408d6d036a7dfa6929cff8783 \ - --hash=sha256:2c03cc56021a4bd59be889c2b9257dae13bf55041a3372d3295416f86b295fb5 \ - --hash=sha256:2e396d70bc4ef5325b72b593a72c8979999aa52fb8bcf03f701c1b03e1166918 \ - --hash=sha256:2edb64ee7bf1ed524a1da60cdcd2e1f6e2b4f66ef7c077680739f1641f62f555 \ - --hash=sha256:31a9ddf4718d10ae04d9b18801bd776693487cbb57d74cc3458a7673f6f34639 \ - --hash=sha256:356541bf4381fa35856dafa6a965916e54bed415ad8a24ee6de6e37deccf2786 \ - --hash=sha256:358a7c4cb8ba9b46c453b1dd8d9e431452d5249072e4f56cfda3149f6ab1405e \ - --hash=sha256:37f8febc8ec50c14f3ec9637505f28e58d4f66752207ea177c1d67df25da5aed \ - 
--hash=sha256:39049da0ffb96c8cbb65cbf5c5f3ca3168990adf3551bd1dee10c48fce8ae820 \ - --hash=sha256:39cf9ed17fe3b1bc81f33c9ceb6ce67683ee7526e65fde1447c772afc54a1bb8 \ - --hash=sha256:3ae1de54a77dc0d6d5fcf623290af4266412a7c4be0b1ff7444394f03f5c54e3 \ - --hash=sha256:3b590df687e3c5ee0deef9fc8c547d81986d9a1b56073d82de008744452d6541 \ - --hash=sha256:3e45867f1f2ab0711d60c6c71746ac53537f1684baa699f4f668d4c6f6ce8e14 \ - --hash=sha256:3fc1c4a2ffd64890aebdb3f97e1278b0cc72579a08ca4de8cd2c04799a3a22be \ - --hash=sha256:4457ea6774b5611f4bed5eaa5df55f70abde42364d498c5134b7ef4c6958e20e \ - --hash=sha256:44ba614de5361b3e5278e1241fda3dc1838deed864b50a10d7ce92983797fa76 \ - --hash=sha256:4a8fcf28c05c1f6d7e177a9a46a1c52798bfe2ad80681d275b10dcf317deaf0b \ - --hash=sha256:4b0d02d7102dd0f997580b51edc4cebcf2ab6397a7edf89f1c73b586c614272c \ - --hash=sha256:502218f52498a36d6bf5ea77081844017bf7982cdbe521ad85e64cabee1b608b \ - --hash=sha256:503e65837c71b875ecdd733877d852adbc465bd82c768a067badd953bf1bc5a3 \ - --hash=sha256:5995f0164fa7df59db4746112fec3f49c461dd6b31b841873443bdb077c13cfc \ - --hash=sha256:59e5686dd847347e55dffcc191a96622f016bc0ad89105e24c14e0d6305acbc6 \ - --hash=sha256:601f36512f9e28f029d9481bdaf8e89e5148ac5d89cffd3b05cd533eeb423b59 \ - --hash=sha256:608862a7bf6957f2333fc54ab4399e405baad0163dc9f8d99cb236816db169d4 \ - --hash=sha256:62595ab75873d50d57323a91dd03e6966eb79c41fa834b7a1661ed043b2d404d \ - --hash=sha256:70990b9c51340e4044cfc394a81f614f3f90d41397104d226f21e66de668730d \ - --hash=sha256:71140351489970dfe5e60fc621ada3e0f41104a5eddaca47a7acb3c1b851d6d3 \ - --hash=sha256:72966d1b297c741541ca8cf1223ff262a6febe52481af742036a0b296e35fa5a \ - --hash=sha256:74292fc76c905c0ef095fe11e188a32ebd03bc38f3f3e9bcb85e4e6db177b7ea \ - --hash=sha256:761e8904c07ad053d285670f36dd94e1b6ab7f16ce62b9805c475b7aa1cffde6 \ - --hash=sha256:772b87914ff1152b92a197ef4ea40efe27a378606c39446ded52c8f80f79702e \ - --hash=sha256:79909e27e8e4fcc9db4addea88aa63f6423ebb171db091fb4373e3312cb6d603 \ - 
--hash=sha256:7e189e2e1d3ed2f4aebabd2d5b0f931e883676e51c7624826e0a4e5fe8a0bf24 \ - --hash=sha256:7eb33a30d75562222b64f569c642ff3dc6689e09adda43a082208397f016c39a \ - --hash=sha256:81d6741ab457d14fdedc215516665050f3822d3e56508921cc7239f8c8e66a58 \ - --hash=sha256:8499ca8f4502af841f68135133d8258f7b32a53a1d594aa98cc52013fff55678 \ - --hash=sha256:84c3990934bae40ea69a82034912ffe5a62c60bbf6ec5bc9691419641d7d5c9a \ - --hash=sha256:87701167f2a5c930b403e9756fab1d31d4d4da52856143b609e30a1ce7160f3c \ - --hash=sha256:88600c72ef7587fe1708fd242b385b6ed4b8904976d5da0893e31df8b3480cb6 \ - --hash=sha256:8ac7b6a045b814cf0c47f3623d21ebd88b3e8cf216a14790b455ea7ff0135d18 \ - --hash=sha256:8b8af03d2e37866d023ad0ddea594edefc31e827fee64f8de5611a1dbc373174 \ - --hash=sha256:8c7fe7afa480e3e82eed58e0ca89f751cd14d767638e2550c77a92a9e749c317 \ - --hash=sha256:8eade758719add78ec36dc13201483f8e9b5d940329285edcd5f70c0a9edbd7f \ - --hash=sha256:911d8a40b2bef5b8bbae2e36a0b103f142ac53557ab421dc16ac4aafee6f53dc \ - --hash=sha256:93ad6d87ac18e2a90b0fe89df7c65263b9a99a0eb98f0a3d2e079f12a0735837 \ - --hash=sha256:95dea361dd73757c6f1c0a1480ac499952c16ac83f7f5f4f84f0658a01b8ef41 \ - --hash=sha256:9ab77acb98eba3fd2a85cd160851816bfce6871d944d885febf012713f06659c \ - --hash=sha256:9cb3032517f1627cc012dbc80a8ec976ae76d93ea2b5feaa9d2a5b8882597579 \ - --hash=sha256:9cf4e8ad252f7c38dd1f676b46514f92dc0ebeb0db5552f5f403509705e24753 \ - --hash=sha256:9d9153257a3f70d5f69edf2325357251ed20f772b12e593f3b3377b5f78e7ef8 \ - --hash=sha256:a152f5f33d64a6be73f1d30c9cc82dfc73cec6477ec268e7c6e4c7d23c2d2291 \ - --hash=sha256:a16418ecf1329f71df119e8a65f3aa68004a3f9383821edcb20f0702934d8087 \ - --hash=sha256:a60332922359f920193b1d4826953c507a877b523b2395ad7bc716ddd386d866 \ - --hash=sha256:a8d0fc946c784ff7f7c3742310cc8a57c5c6dc31631269876a88b809dbeff3d3 \ - --hash=sha256:ab5de034a886f616a5668aa5d098af2b5385ed70142090e2a31bcbd0af0fdb3d \ - --hash=sha256:c22d3fe05ce11d3671297dc8973267daa0f938b93ec716e12e0f6dee81591dc1 \ - 
--hash=sha256:c2ac1b08635a8cd4e0cbeaf6f5e922085908d48eb05d44c5ae9eabab148512ca \ - --hash=sha256:c512accbd6ff0270939b9ac214b84fb5ada5f0409c44298361b2f5e13f9aed9e \ - --hash=sha256:c75ffc45f25324e68ab238cb4b5c0a38cd1c3d7f1fb1f72b5541de469e2247db \ - --hash=sha256:c95a03c79bbe30eec3ec2b7f076074f4281526724c8685a42872974ef4d36b72 \ - --hash=sha256:cadaeaba78750d58d3cc6ac4d1fd867da6fc73c88156b7a3212a3cd4819d679d \ - --hash=sha256:cd6056167405314a4dc3c173943f11249fa0f1b204f8b51ed4bde1a9cd1834dc \ - --hash=sha256:db72b07027db150f468fbada4d85b3b2729a3db39178abf5c543b784c1254539 \ - --hash=sha256:df2c707231459e8a4028eabcd3cfc827befd635b3ef72eada84ab13b52e1574d \ - --hash=sha256:e62164b50f84e20601c1ff8eb55620d2ad25fb81b59e3cd776a1902527a788af \ - --hash=sha256:e696f0dd336161fca9adbb846875d40752e6eba585843c768935ba5c9960722b \ - --hash=sha256:eaa379fcd227ca235d04152ca6704c7cb55564116f8bc52545ff357628e10602 \ - --hash=sha256:ebea339af930f8ca5d7a699b921106c6e29c617fe9606fa7baa043c1cdae326f \ - --hash=sha256:f4c39b0e3eac288fedc2b43055cfc2ca7a60362d0e5e87a637beac5d801ef478 \ - --hash=sha256:f5057856d21e7586765171eac8b9fc3f7d44ef39425f85dbcccb13b3ebea806c \ - --hash=sha256:f6f45710b4459401609ebebdbcfb34515da4fc2aa886f95107f556ac69a9147e \ - --hash=sha256:f97e83fa6c25693c7a35de154681fcc257c1c41b38beb0304b9c4d2d9e164479 \ - --hash=sha256:f9d0c5c045a3ca9bedfc35dca8526798eb91a07aa7a2c0fee134c6c6f321cbd7 \ - --hash=sha256:ff6f3db31555657f3163b15a6b7c6938d08df7adbfc9dd13d9d19edad678f1e8 +chardet==5.1.0 + # via binaryornot +charset-normalizer==3.0.1 # via requests -giturlparse==0.10.0 \ - --hash=sha256:04ba1a3a099c3093fa8d24a422913c6a9b2c2cd22bcffc939cf72e3e98f672d7 \ - --hash=sha256:2595ab291d30717cda8474b874c9fd509f1b9802ad7f6968c36a45e4b13eb337 +click==8.1.3 + # via cookiecutter +cookiecutter==2.1.1 + # via taskcluster-taskgraph +giturlparse==0.10.0 # via mozilla-repo-urls -idna==3.4 \ - --hash=sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4 \ - 
--hash=sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2 +idna==3.4 # via requests -json-e==4.5.0 \ - --hash=sha256:618a94aecc8b8bc7733d6cd0ee7b676e45675566625a38958aa8b30379d9758f \ - --hash=sha256:e733ce77b4acbbc2c48211057f8cb5af45999e6be4ce0f07585c5580df45826e - # via taskcluster-taskgraph -mozilla-repo-urls==0.1.1 \ - --hash=sha256:30510d3519479aa70211145d0ac9cf6e2fadcb8d30fa3b196bb957bd773502ba \ - --hash=sha256:7364da790751db2a060eb45adbf1d7db89a145ed279ba235f3425db9dd255915 +jinja2==3.1.2 + # via + # cookiecutter + # jinja2-time +jinja2-time==0.2.0 + # via cookiecutter +json-e==4.5.0 # via taskcluster-taskgraph -pyyaml==6.0 \ - --hash=sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf \ - --hash=sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293 \ - --hash=sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b \ - --hash=sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57 \ - --hash=sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b \ - --hash=sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4 \ - --hash=sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07 \ - --hash=sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba \ - --hash=sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9 \ - --hash=sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287 \ - --hash=sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513 \ - --hash=sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0 \ - --hash=sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782 \ - --hash=sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0 \ - --hash=sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92 \ - --hash=sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f \ - 
--hash=sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2 \ - --hash=sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc \ - --hash=sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1 \ - --hash=sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c \ - --hash=sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86 \ - --hash=sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4 \ - --hash=sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c \ - --hash=sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34 \ - --hash=sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b \ - --hash=sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d \ - --hash=sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c \ - --hash=sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb \ - --hash=sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7 \ - --hash=sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737 \ - --hash=sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3 \ - --hash=sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d \ - --hash=sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358 \ - --hash=sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53 \ - --hash=sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78 \ - --hash=sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803 \ - --hash=sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a \ - --hash=sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f \ - --hash=sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174 \ - --hash=sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5 
+markupsafe==2.1.2 + # via jinja2 +mozilla-repo-urls==0.1.1 # via taskcluster-taskgraph -redo==2.0.4 \ - --hash=sha256:81066955041c853b0e6491eb65a0877dce45131c4cfa3d42d923fc2aa8f7a043 \ - --hash=sha256:c76e4c23ab2f8840261736a851323cd98493710e7a9d36a1058535dca501f293 +python-dateutil==2.8.2 + # via arrow +python-slugify==8.0.1 + # via cookiecutter +pyyaml==6.0 + # via + # cookiecutter + # taskcluster-taskgraph +redo==2.0.4 # via taskcluster-taskgraph -requests==2.28.2 \ - --hash=sha256:64299f4909223da747622c030b781c0d7811e359c37124b4bd368fb8c6518baa \ - --hash=sha256:98b1b2782e3c6c4904938b84c0eb932721069dfdb9134313beff7c83c2df24bf +requests==2.28.2 # via + # cookiecutter # requests-unixsocket # taskcluster-taskgraph -requests-unixsocket==0.3.0 \ - --hash=sha256:28304283ea9357d45fff58ad5b11e47708cfbf5806817aa59b2a363228ee971e \ - --hash=sha256:c685c680f0809e1b2955339b1e5afc3c0022b3066f4f7eb343f43a6065fc0e5d +requests-unixsocket==0.3.0 # via taskcluster-taskgraph -slugid==2.0.0 \ - --hash=sha256:a950d98b72691178bdd4d6c52743c4a2aa039207cf7a97d71060a111ff9ba297 \ - --hash=sha256:aec8b0e01c4ad32e38e12d609eab3ec912fd129aaf6b2ded0199b56a5f8fd67c +six==1.16.0 + # via python-dateutil +slugid==2.0.0 # via taskcluster-taskgraph -taskcluster-taskgraph==4.2.0 \ - --hash=sha256:2a8c1ce1796307c67bda5321fd7c810af49da5a791dbcfe3c14cc3d46ec8b1a3 \ - --hash=sha256:d0fa3b4079e7aacaf0b84d502b82d5b4bd37a8f1ed23cc6381c898c05755de79 +taskcluster-taskgraph @ git+https://github.com/bhearsum/taskgraph@fetch-multiple-artifacts # via -r requirements.in -taskcluster-urls==13.0.1 \ - --hash=sha256:5e25e7e6818e8877178b175ff43d2e6548afad72694aa125f404a7329ece0973 \ - --hash=sha256:b25e122ecec249c4299ac7b20b08db76e3e2025bdaeb699a9d444556de5fd367 \ - --hash=sha256:f66dcbd6572a6216ab65949f0fa0b91f2df647918028436c384e6af5cd12ae2b +taskcluster-urls==13.0.1 # via taskcluster-taskgraph -urllib3==1.26.14 \ - --hash=sha256:076907bf8fd355cde77728471316625a4d2f7e713c125f51953bb5b3eecf4f72 \ - 
--hash=sha256:75edcdc2f7d85b137124a6c3c9fc3933cdeaa12ecb9a6a959f22797a0feca7e1 +text-unidecode==1.3 + # via python-slugify +urllib3==1.26.14 # via requests -voluptuous==0.13.1 \ - --hash=sha256:4b838b185f5951f2d6e8752b68fcf18bd7a9c26ded8f143f92d6d28f3921a3e6 \ - --hash=sha256:e8d31c20601d6773cb14d4c0f42aee29c6821bbd1018039aac7ac5605b489723 +voluptuous==0.13.1 # via taskcluster-taskgraph From 18f7e9f67ffc1c71df7ee2543fd3776ef0c4f550 Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Wed, 29 Mar 2023 20:14:24 -0400 Subject: [PATCH 03/24] Get rid of hello kind now that we know that Taskcluster works --- taskcluster/ci/hello/kind.yml | 21 -------------- .../transforms/hello.py | 28 ------------------- 2 files changed, 49 deletions(-) delete mode 100644 taskcluster/ci/hello/kind.yml delete mode 100644 taskcluster/translations_taskgraph/transforms/hello.py diff --git a/taskcluster/ci/hello/kind.yml b/taskcluster/ci/hello/kind.yml deleted file mode 100644 index 818b87e90..000000000 --- a/taskcluster/ci/hello/kind.yml +++ /dev/null @@ -1,21 +0,0 @@ ---- -loader: taskgraph.loader.transform:loader - -transforms: - - translations_taskgraph.transforms.hello:transforms - - taskgraph.transforms.job:transforms - - taskgraph.transforms.task:transforms - -task-defaults: - worker-type: t-linux-large - worker: - docker-image: {in-tree: linux} - max-run-time: 1800 - -tasks: - world: - noun: world - run: - using: run-task - command: >- - echo "Hello $NOUN!" 
diff --git a/taskcluster/translations_taskgraph/transforms/hello.py b/taskcluster/translations_taskgraph/transforms/hello.py deleted file mode 100644 index 25dd7f159..000000000 --- a/taskcluster/translations_taskgraph/transforms/hello.py +++ /dev/null @@ -1,28 +0,0 @@ -from voluptuous import ALLOW_EXTRA, Required - -from taskgraph.transforms.base import TransformSequence -from taskgraph.util.schema import Schema - -transforms = TransformSequence() - -HELLO_SCHEMA = Schema( - { - Required("noun"): str, - }, - extra=ALLOW_EXTRA, -) - -transforms = TransformSequence() -transforms.add_validate(HELLO_SCHEMA) - - -@transforms.add -def add_noun(config, tasks): - for task in tasks: - noun = task.pop("noun").capitalize() - task["description"] = f"Prints 'Hello {noun}'" - - env = task.setdefault("worker", {}).setdefault("env", {}) - env["NOUN"] = noun - - yield task From 9f38ed06452804b252f71550c321f71edd1145f6 Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Thu, 6 Apr 2023 16:30:16 -0400 Subject: [PATCH 04/24] Add worker type for b-linux-large, for more CPU intensive tasks; reformat yaml in ci/config.yml --- taskcluster/ci/config.yml | 45 ++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/taskcluster/ci/config.yml b/taskcluster/ci/config.yml index 03b403e76..45bbb98a5 100644 --- a/taskcluster/ci/config.yml +++ b/taskcluster/ci/config.yml @@ -3,25 +3,30 @@ trust-domain: "translations" task-priority: low taskgraph: - cached-task-prefix: "translations.v2.staging-firefox-translations-training" - repositories: - firefox_translations_training: - name: "staging-firefox-translations-training" + cached-task-prefix: "translations.v2.staging-firefox-translations-training" + repositories: + firefox_translations_training: + name: "staging-firefox-translations-training" workers: - aliases: - b-linux: - provisioner: '{trust-domain}-{level}' - implementation: docker-worker - os: linux - worker-type: '{alias}-gcp' - images: - provisioner: 
'{trust-domain}-{level}' - implementation: docker-worker - os: linux - worker-type: '{alias}-gcp' - t-linux-large: - provisioner: '{trust-domain}-t' - implementation: docker-worker - os: linux - worker-type: '{alias}-gcp' + aliases: + b-linux: + provisioner: '{trust-domain}-{level}' + implementation: docker-worker + os: linux + worker-type: '{alias}-gcp' + b-linux-large: + provisioner: '{trust-domain}-{level}' + implementation: docker-worker + os: linux + worker-type: '{alias}-gcp' + images: + provisioner: '{trust-domain}-{level}' + implementation: docker-worker + os: linux + worker-type: '{alias}-gcp' + t-linux-large: + provisioner: '{trust-domain}-t' + implementation: docker-worker + os: linux + worker-type: '{alias}-gcp' From 7e8a1761e2894c1f507f5dd85f0bb0afc1442398 Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Fri, 14 Apr 2023 19:39:47 -0400 Subject: [PATCH 05/24] Add yamllint config for taskcluster files --- taskcluster/.yamllint | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 taskcluster/.yamllint diff --git a/taskcluster/.yamllint b/taskcluster/.yamllint new file mode 100644 index 000000000..79cf45ae6 --- /dev/null +++ b/taskcluster/.yamllint @@ -0,0 +1,7 @@ +--- +extends: default +rules: + line-length: + max: 120 + indentation: + spaces: 4 From 039d3db86c85dfdede6ac250fc349ac5b1845365 Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Wed, 29 Mar 2023 21:22:47 -0400 Subject: [PATCH 06/24] Add toolchain tasks for things that we depend on to train language models. Most of these are straight forward download and compiles, but there's a few callouts: - The CLI tools (marian, fast-align, etc.) already have build scripts used by the existing pipeline. For the most part, I'm just replacing them with my own version because they're just unpack/make/cmake. The exception is Marian, which has a little bit more going on with cmake definitions. Maybe I should just copy those in here though? 
- Some Python modules that don't have binary wheels available, which we ought to build to avoid needing to compile them at the start of training tasks. - CUDA (a NVIDIA Toolkit) is a huge pain. They don't have any real advertised way to just dump the files you want into a directory (they want you to run an installer). I _think_ I managed to get this work, but it's possible this will need a tweak in the future if a future task has trouble with the current toolchain. This also necessitated switching Docker images to Ubuntu, because some tools were not reasonably possible to make work on Alpine. --- taskcluster/ci/config.yml | 5 + taskcluster/ci/docker-image/kind.yml | 6 +- taskcluster/ci/fetch/kind.yml | 17 +++ taskcluster/ci/fetch/python.yml | 22 ++++ taskcluster/ci/fetch/toolchains.yml | 42 ++++++ taskcluster/ci/toolchain/kind.yml | 121 ++++++++++++++++++ taskcluster/docker/base/Dockerfile | 45 +++++++ taskcluster/docker/linux/Dockerfile | 32 ----- taskcluster/docker/toolchain-build/Dockerfile | 36 ++++++ .../scripts/toolchain/build-cuda-toolkit.sh | 24 ++++ .../scripts/toolchain/build-extract-lex.sh | 12 ++ .../scripts/toolchain/build-fast-align.sh | 12 ++ .../scripts/toolchain/build-hunspell.sh | 11 ++ taskcluster/scripts/toolchain/build-kenlm.sh | 19 +++ taskcluster/scripts/toolchain/build-marian.sh | 19 +++ .../scripts/toolchain/build-preprocess.sh | 12 ++ 16 files changed, 402 insertions(+), 33 deletions(-) create mode 100644 taskcluster/ci/fetch/kind.yml create mode 100644 taskcluster/ci/fetch/python.yml create mode 100644 taskcluster/ci/fetch/toolchains.yml create mode 100644 taskcluster/ci/toolchain/kind.yml create mode 100644 taskcluster/docker/base/Dockerfile delete mode 100644 taskcluster/docker/linux/Dockerfile create mode 100644 taskcluster/docker/toolchain-build/Dockerfile create mode 100755 taskcluster/scripts/toolchain/build-cuda-toolkit.sh create mode 100755 taskcluster/scripts/toolchain/build-extract-lex.sh create mode 100755 
taskcluster/scripts/toolchain/build-fast-align.sh create mode 100755 taskcluster/scripts/toolchain/build-hunspell.sh create mode 100755 taskcluster/scripts/toolchain/build-kenlm.sh create mode 100755 taskcluster/scripts/toolchain/build-marian.sh create mode 100755 taskcluster/scripts/toolchain/build-preprocess.sh diff --git a/taskcluster/ci/config.yml b/taskcluster/ci/config.yml index 45bbb98a5..12672f5f7 100644 --- a/taskcluster/ci/config.yml +++ b/taskcluster/ci/config.yml @@ -1,6 +1,11 @@ --- trust-domain: "translations" task-priority: low +treeherder: + group-names: + "I": "Docker images" + "Fetch": "Fetching tasks" + "TL": "Toolchain tasks" taskgraph: cached-task-prefix: "translations.v2.staging-firefox-translations-training" diff --git a/taskcluster/ci/docker-image/kind.yml b/taskcluster/ci/docker-image/kind.yml index cc2480a81..400745d85 100644 --- a/taskcluster/ci/docker-image/kind.yml +++ b/taskcluster/ci/docker-image/kind.yml @@ -7,4 +7,8 @@ transforms: - taskgraph.transforms.task:transforms tasks: - linux: {} + base: + symbol: Base + toolchain-build: + parent: base + symbol: TL diff --git a/taskcluster/ci/fetch/kind.yml b/taskcluster/ci/fetch/kind.yml new file mode 100644 index 000000000..34789a8d4 --- /dev/null +++ b/taskcluster/ci/fetch/kind.yml @@ -0,0 +1,17 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+--- +loader: taskgraph.loader.transform:loader + +transforms: + - taskgraph.transforms.fetch:transforms + - taskgraph.transforms.job:transforms + - taskgraph.transforms.task:transforms + +task-defaults: + docker-image: {in-tree: base} + +tasks-from: + - python.yml + - toolchains.yml diff --git a/taskcluster/ci/fetch/python.yml b/taskcluster/ci/fetch/python.yml new file mode 100644 index 000000000..d0aa1d87a --- /dev/null +++ b/taskcluster/ci/fetch/python.yml @@ -0,0 +1,22 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +--- +hunspell: + description: Hunspell 0.5.5 source + fetch: + type: static-url + url: https://files.pythonhosted.org/packages/e5/fe/cfc4dfd92c3a37c34d2806d5b84f9981bf3520db20149f8ee1a61f6fc69d/hunspell-0.5.5.tar.gz + sha256: 0f830b68bd8c392f4d5b4e21c38e28809e14d64ec67bde48272c920b63686f53 + size: 34609 + artifact-name: hunspell.tar.zst + strip-components: 1 + add-prefix: hunspell/ + +kenlm: + description: kenlm + fetch: + type: git + repo: https://github.com/kpu/kenlm + revision: 4e6ac85c8d01ac91cb61dfbdc76cd652158c5969 + path-prefix: kenlm-source diff --git a/taskcluster/ci/fetch/toolchains.yml b/taskcluster/ci/fetch/toolchains.yml new file mode 100644 index 000000000..956384240 --- /dev/null +++ b/taskcluster/ci/fetch/toolchains.yml @@ -0,0 +1,42 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+--- +marian: + description: Marian + fetch: + type: git + repo: https://github.com/marian-nmt/marian-dev + revision: e8a1a2530fb84cbff7383302ebca393e5875c441 + path-prefix: marian-source + include-dot-git: true + +fast-align: + description: fast_align + fetch: + type: git + repo: https://github.com/clab/fast_align + revision: cab1e9aac8d3bb02ff5ae58218d8d225a039fa11 + +preprocess: + description: preprocess + fetch: + type: git + repo: https://github.com/kpu/preprocess + revision: 64307314b4d5a9a0bd529b5c1036b0710d995eec + +extract-lex: + description: extract-lex + fetch: + type: git + repo: https://github.com/marian-nmt/extract-lex + revision: 42fa605b53f32eaf6c6e0b5677255c21c91b3d49 + +cuda: + description: CUDA 12.1.0 Source + fetch: + type: static-url + url: https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run # yamllint disable-line rule:line-length + sha256: 68699036c12d71adb9ad2799dce2ff070270fab4488b90920b9756ab3f52c41c + size: 4245586997 + artifact-name: cuda-source.run diff --git a/taskcluster/ci/toolchain/kind.yml b/taskcluster/ci/toolchain/kind.yml new file mode 100644 index 000000000..492afe9ff --- /dev/null +++ b/taskcluster/ci/toolchain/kind.yml @@ -0,0 +1,121 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+--- +loader: taskgraph.loader.transform:loader + +kind-dependencies: + - fetch + +transforms: + - taskgraph.transforms.job:transforms + - taskgraph.transforms.cached_tasks:transforms + - taskgraph.transforms.task:transforms + +task-defaults: + worker-type: b-linux + worker: + docker-image: {"in-tree": "toolchain-build"} + max-run-time: 3600 + env: {} + treeherder: + platform: toolchain/opt + run: + using: toolchain-script + +tasks: + cuda-toolkit: + description: CUDA Toolkit preparation + treeherder: + symbol: TL(CUDA) + run: + script: build-cuda-toolkit.sh + resources: + - taskcluster/scripts/toolchain/build-cuda-toolkit.sh + toolchain-artifact: public/build/cuda-toolkit.tar.zst + fetches: + fetch: + - cuda + + # TODO: probably need to make sure that these are all built statically? + marian: + description: Marian + treeherder: + symbol: TL(Marian) + worker-type: b-linux-large + run: + script: build-marian.sh + resources: + - taskcluster/scripts/toolchain/build-marian.sh + - pipeline/setup/compile-marian.sh + toolchain-artifact: public/build/marian.tar.zst + fetches: + fetch: + - marian + toolchain: + - cuda-toolkit + + fast-align: + description: fast_align + treeherder: + symbol: TL(FA) + run: + script: build-fast-align.sh + resources: + - taskcluster/scripts/toolchain/build-fast-align.sh + toolchain-artifact: public/build/fast-align.tar.zst + fetches: + fetch: + - fast-align + + preprocess: + description: preprocess + treeherder: + symbol: TL(PP) + run: + script: build-preprocess.sh + resources: + - taskcluster/scripts/toolchain/build-preprocess.sh + toolchain-artifact: public/build/dedupe + fetches: + fetch: + - preprocess + + extract-lex: + description: extract-lex + treeherder: + symbol: TL(Lex) + run: + script: build-extract-lex.sh + resources: + - taskcluster/scripts/toolchain/build-extract-lex.sh + toolchain-artifact: public/build/extract_lex + fetches: + fetch: + - extract-lex + + kenlm: + description: kenlm + treeherder: + symbol: TL(kenlm) + run: + 
script: build-kenlm.sh + resources: + - taskcluster/scripts/toolchain/build-kenlm.sh + toolchain-artifact: public/build/kenlm-0.0.0-cp310-cp310-linux_x86_64.whl + fetches: + fetch: + - kenlm + + hunspell: + description: build hunspell binary wheel + treeherder: + symbol: TL(hunspell) + run: + script: build-hunspell.sh + resources: + - taskcluster/scripts/toolchain/build-hunspell.sh + toolchain-artifact: public/build/hunspell-0.5.5-cp310-cp310-linux_x86_64.whl + fetches: + fetch: + - hunspell diff --git a/taskcluster/docker/base/Dockerfile b/taskcluster/docker/base/Dockerfile new file mode 100644 index 000000000..d5b99ab43 --- /dev/null +++ b/taskcluster/docker/base/Dockerfile @@ -0,0 +1,45 @@ +FROM ubuntu:22.04 +LABEL maintainer="Mozilla Release Engineering " + +# Add worker user +RUN mkdir /builds && \ + useradd -d /builds/worker -s /bin/bash -m worker && \ + chown worker:worker /builds/worker && \ + mkdir /builds/worker/artifacts && \ + chown worker:worker /builds/worker/artifacts + +WORKDIR /builds/worker/ + +#---------------------------------------------------------------------------------------------------------------------- +#-- Configuration ----------------------------------------------------------------------------------------------------- +#---------------------------------------------------------------------------------------------------------------------- + +ENV CURL='curl --location --retry 5' \ + LANG='en_US.UTF-8' \ + TERM='dumb' + +RUN apt-get update -qq \ + # We need to install tzdata before all of the other packages. Otherwise it will show an interactive dialog that + # we cannot navigate while building the Docker image. 
+ && apt-get install -y tzdata \ + && apt-get install -y python3 \ + python3-pip \ + python3-yaml \ + locales \ + git \ + && apt-get clean + +RUN locale-gen "$LANG" + +RUN pip install zstandard + +# %include-run-task + +ENV SHELL=/bin/bash \ + HOME=/builds/worker \ + PATH="/builds/worker/.local/bin:$PATH" + +VOLUME /builds/worker/checkouts +VOLUME /builds/worker/.cache + +USER root diff --git a/taskcluster/docker/linux/Dockerfile b/taskcluster/docker/linux/Dockerfile deleted file mode 100644 index 4875437ca..000000000 --- a/taskcluster/docker/linux/Dockerfile +++ /dev/null @@ -1,32 +0,0 @@ -FROM alpine:latest -LABEL maintainer="Mozilla Release Engineering " - -# Add worker user -RUN mkdir /builds && \ - adduser -h /builds/worker -s /bin/ash -D worker && \ - mkdir /builds/worker/artifacts && \ - chown worker:worker /builds/worker/artifacts - -# Update repositories -RUN apk update - -# Setup Python -RUN apk add --no-cache python3 && \ - ln -sf python3 /usr/bin/python && \ - python3 -m ensurepip && \ - pip3 install --no-cache --upgrade pip setuptools - -# Setup other dependencies -RUN apk add bash git - -# %include-run-task - -ENV SHELL=/bin/ash \ - HOME=/builds/worker \ - PATH=/builds/worker/.local/bin:$PATH - -VOLUME /builds/worker/checkouts -VOLUME /builds/worker/.cache - -# Set a default command useful for debugging -CMD ["/bin/ash"] diff --git a/taskcluster/docker/toolchain-build/Dockerfile b/taskcluster/docker/toolchain-build/Dockerfile new file mode 100644 index 000000000..7055d19c8 --- /dev/null +++ b/taskcluster/docker/toolchain-build/Dockerfile @@ -0,0 +1,36 @@ +FROM $DOCKER_IMAGE_PARENT +LABEL maintainer="Mozilla Release Engineering " + +RUN apt-get update -qq \ + # We need to install tzdata before all of the other packages. Otherwise it will show an interactive dialog that + # we cannot navigate while building the Docker image. 
+ && apt-get install -y tzdata \ + && apt-get install -y wget \ + zip \ + build-essential \ + gcc \ + g++ \ + make \ + cmake \ + libboost-dev \ + libboost-all-dev \ + zstd \ + tar \ + libxml2 \ + libhunspell-dev \ + && apt-get clean + +RUN locale-gen "$LANG" + +RUN pip install zstandard + +# %include-run-task + +ENV SHELL=/bin/bash \ + HOME=/builds/worker \ + PATH="/builds/worker/.local/bin:$PATH" + +VOLUME /builds/worker/checkouts +VOLUME /builds/worker/.cache + +USER root diff --git a/taskcluster/scripts/toolchain/build-cuda-toolkit.sh b/taskcluster/scripts/toolchain/build-cuda-toolkit.sh new file mode 100755 index 000000000..729e78a9c --- /dev/null +++ b/taskcluster/scripts/toolchain/build-cuda-toolkit.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -e +set -x + +# CUDA installers do not have a silent mode of operation that allows to run +# them without also installing GPU drivers and other unnecessary things. +# Instead, we extract the raw contents of the installer, and then combine +# the extracted contents into a tarball. + +export CUDA_INSTALLER=$MOZ_FETCHES_DIR/cuda-source.run + +TARFILE=$UPLOAD_DIR/cuda-toolkit.tar + +chmod +x $CUDA_INSTALLER +# This installer advertises a `--extract` option which put +# the contents in a directory of our choosing...but it doesn't +# work when run under alpine linux docker containers. Instead, +# we can use these secret options, which will extract to `pkg` +# in the current working directory. The files we care about +# will end up in `pkg/builds`. 
+EXTRACT_DIR="$(pwd)/cuda-toolkit" +$CUDA_INSTALLER --toolkit --toolkitpath=$EXTRACT_DIR --silent + +tar --zstd -cf $TARFILE.zst cuda-toolkit diff --git a/taskcluster/scripts/toolchain/build-extract-lex.sh b/taskcluster/scripts/toolchain/build-extract-lex.sh new file mode 100755 index 000000000..3401959cb --- /dev/null +++ b/taskcluster/scripts/toolchain/build-extract-lex.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -e +set -x + +EXTRACT_LEX_DIR=$MOZ_FETCHES_DIR/extract-lex + +build_dir=$(mktemp -d) +cd $build_dir +cmake $EXTRACT_LEX_DIR +make -j$(nproc) + +cp $build_dir/extract_lex $UPLOAD_DIR diff --git a/taskcluster/scripts/toolchain/build-fast-align.sh b/taskcluster/scripts/toolchain/build-fast-align.sh new file mode 100755 index 000000000..38102d64a --- /dev/null +++ b/taskcluster/scripts/toolchain/build-fast-align.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -e +set -x + +FAST_ALIGN_DIR=$MOZ_FETCHES_DIR/fast_align + +build_dir=$(mktemp -d) +cd $build_dir +cmake $FAST_ALIGN_DIR +make -j$(nproc) + +tar -c $build_dir/fast_align $build_dir/atools | zstd > $UPLOAD_DIR/fast-align.tar.zst diff --git a/taskcluster/scripts/toolchain/build-hunspell.sh b/taskcluster/scripts/toolchain/build-hunspell.sh new file mode 100755 index 000000000..0453b7356 --- /dev/null +++ b/taskcluster/scripts/toolchain/build-hunspell.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -e +set -x + +HUNSPELL_DIR=$MOZ_FETCHES_DIR/hunspell + +cd $HUNSPELL_DIR +python3 setup.py bdist_wheel +whl=$(ls dist/*.whl) + +cp $whl $UPLOAD_DIR/ diff --git a/taskcluster/scripts/toolchain/build-kenlm.sh b/taskcluster/scripts/toolchain/build-kenlm.sh new file mode 100755 index 000000000..13fcb108d --- /dev/null +++ b/taskcluster/scripts/toolchain/build-kenlm.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -e +set -x + +KENLM_DIR=$MOZ_FETCHES_DIR/kenlm-source + +# TODO: I don't think we actually need the C++ stuff? 
just the python module +# build_dir=$(mktemp -d) +# cd $build_dir +# cmake $KENLM_DIR -DKENLM_MAX_ORDER=7 +# make -j$(nproc) + +cd $KENLM_DIR +# Install these separately so they will install as wheels. +# Using `--build-option` below disables wheels even for dependencies. +pip install setuptools wheel cmake +MAX_ORDER=7 python3 setup.py bdist_wheel +find . +cp $KENLM_DIR/dist/kenlm-0.0.0-cp310-cp310-linux_x86_64.whl $UPLOAD_DIR/ diff --git a/taskcluster/scripts/toolchain/build-marian.sh b/taskcluster/scripts/toolchain/build-marian.sh new file mode 100755 index 000000000..b453cbe89 --- /dev/null +++ b/taskcluster/scripts/toolchain/build-marian.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -e +set -x + +export MARIAN_DIR=$MOZ_FETCHES_DIR/marian-source +export CUDA_DIR=$MOZ_FETCHES_DIR/cuda-toolkit + +# TODO: consider not calling out to this since it's such a simple script... +bash $VCS_PATH/pipeline/setup/compile-marian.sh "${MARIAN_DIR}/build" "$(nproc)" + +cd $MARIAN_DIR/build +tar --zstd -cf $UPLOAD_DIR/marian.tar.zst \ + "marian" \ + "marian-decoder" \ + "marian-scorer" \ + "marian-conv" \ + "spm_train" \ + "spm_encode" \ + "spm_export_vocab" diff --git a/taskcluster/scripts/toolchain/build-preprocess.sh b/taskcluster/scripts/toolchain/build-preprocess.sh new file mode 100755 index 000000000..e6935db0b --- /dev/null +++ b/taskcluster/scripts/toolchain/build-preprocess.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -e +set -x + +PREPROCESS_DIR=$MOZ_FETCHES_DIR/preprocess + +build_dir=$(mktemp -d) +cd $build_dir +cmake $PREPROCESS_DIR -DBUILD_TYPE=Release +make -j$(nproc) + +cp $build_dir/bin/dedupe $UPLOAD_DIR From b515a254e2c717d3e4a8a6f6ebad0578d4540962 Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Fri, 31 Mar 2023 13:42:28 -0400 Subject: [PATCH 07/24] Bump decision task image --- .taskcluster.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.taskcluster.yml b/.taskcluster.yml index f3ce55e26..b1b137a02 100644 --- a/.taskcluster.yml +++ b/.taskcluster.yml 
@@ -203,7 +203,7 @@ tasks: features: taskclusterProxy: true - image: mozillareleases/taskgraph:decision-10378fde0bf12adbd64e74313bf72ea3c6caf311ad6af23e2bff1d8f1232a221@sha256:7518c410bdf91142b0e26455d26ddaf861202cfbb3c35d0b1ef85d1ed577a5bd + image: mozillareleases/taskgraph:decision-5483484ad45a3d27a0f5bd05f1c87d90e08df67a3713605d812b851a8a5bd854@sha256:ef132cc5741539f846a85bbe0cebc3c9ead30b8f24c1da46c55363f2170c3993 maxRunTime: 1800 command: From 5eb20076c44d08ad295a89d66db2e6b038765824 Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Fri, 31 Mar 2023 13:31:57 -0400 Subject: [PATCH 08/24] Add tasks to fetch a few dataset types I initially tried to implement these as `fetch` tasks. This failed because the way that we get so many of these is just not compatible with the idea of a static url or having one file per task. Eg: many of these datasets are fetched by running a python script. (In theory this could be reverse engineered, but I just don't think it's worth it...especially if URLs or metadata ends up changing in the future.) Instead, we're making use of the existing pipeline scripts that know how to fetch these. As you can see, the kind generates tasks named after the provider, dataset, and locale pair. I'm not certain this is what we want to do long term (there's going to be an absurd number of tasks after we finish adding all of the datasets and language pairs)....but I think it's OK for now. We probably ought to revisit this before we start running full training pipelines - if we change it after that we'll end up rebuilding tasks due to having no cached tasks for the new names. This revision also builds out a couple of transforms that are used here, and will be used elsewhere: * One that can substitute provider name, dataset (in a few forms), and locale pairs into tasks. This is necessary to avoid needing to repeat things such as commands, treeherder symbols, etc. * Another one that configures caching, using attributes defined in the kind. 
Eventually we're going to be using all sorts of action task parameters as part of the cache digest -- so it's important that we can specify these things per-task. --- taskcluster/ci/config.yml | 52 +++++++++ taskcluster/ci/dataset/kind.yml | 106 ++++++++++++++++++ taskcluster/ci/docker-image/kind.yml | 3 + taskcluster/docker/toolchain-build/Dockerfile | 4 + taskcluster/docker/train/Dockerfile | 15 +++ .../transforms/cache.py | 23 ++++ .../transforms/dataset_substitutions.py | 78 +++++++++++++ 7 files changed, 281 insertions(+) create mode 100644 taskcluster/ci/dataset/kind.yml create mode 100644 taskcluster/docker/train/Dockerfile create mode 100644 taskcluster/translations_taskgraph/transforms/cache.py create mode 100644 taskcluster/translations_taskgraph/transforms/dataset_substitutions.py diff --git a/taskcluster/ci/config.yml b/taskcluster/ci/config.yml index 12672f5f7..169d52492 100644 --- a/taskcluster/ci/config.yml +++ b/taskcluster/ci/config.yml @@ -6,6 +6,11 @@ treeherder: "I": "Docker images" "Fetch": "Fetching tasks" "TL": "Toolchain tasks" + "flores": "flores" + "sacrebleu": "sacrebleu" + "opus": "opus" + "mtdata": "mtdata" + "news-crawl": "news-crawl" taskgraph: cached-task-prefix: "translations.v2.staging-firefox-translations-training" @@ -13,6 +18,53 @@ taskgraph: firefox_translations_training: name: "staging-firefox-translations-training" +# It's not exactly _ideal_ to have all of the locale pairs for each dataset +# specified in this file, but it's very difficult (if not impossible) to +# generate the right `dataset` tasks without it. We _could_ attempt to pull +# the locale pair for all datasets listed here, but not all locales or pairs +# exist for each dataset...which means we'll end up with failures that block +# the rest of the graph. +# There may be other ways to make this work, but in the short term this is +# the most straightforward solution. 
+datasets: + flores: + dev: + - src: en + trg: ru + - src: en + trg: fr + devtest: + - src: en + trg: ru + - src: en + trg: fr + + sacrebleu: + wmt19: + - src: en + trg: ru + wmt20: + - src: en + trg: ru + + opus: + ada83/v1: + - src: en + trg: ru + GNOME/v1: + - src: en + trg: ru + + news-crawl: + news.2020: + - src: en + trg: ru + + mtdata: + Neulab-tedtalks_train-1-eng-rus: + - src: en + trg: ru + workers: aliases: b-linux: diff --git a/taskcluster/ci/dataset/kind.yml b/taskcluster/ci/dataset/kind.yml new file mode 100644 index 000000000..897904397 --- /dev/null +++ b/taskcluster/ci/dataset/kind.yml @@ -0,0 +1,106 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# This kind primarily exists because these dataset fetches break +# some assumptions made the `job` transforms that treat the `fetch` +# kind specially. +--- +loader: taskgraph.loader.transform:loader + +transforms: + - translations_taskgraph.transforms.dataset_substitutions:transforms + - taskgraph.transforms.job:transforms + - translations_taskgraph.transforms.cache:transforms + - taskgraph.transforms.cached_tasks:transforms + - taskgraph.transforms.task:transforms + +task-defaults: + worker-type: b-linux + attributes: + cache-type: dataset + substitution-fields: + - name + - label + - treeherder.symbol + worker: + docker-image: {in-tree: toolchain-build} + max-run-time: 1800 + artifacts: + - name: public/build + path: /builds/worker/artifacts + type: directory + + treeherder: + symbol: "{provider}({dataset_short}-{src_locale}-{trg_locale})" + platform: dataset/opt + run-on-tasks-for: [] + run: + using: run-task + +tasks: + flores: + description: Fetch flores101 dataset + label: dataset-flores-{dataset}-{src_locale}-{trg_locale} + provider: flores + attributes: + cache-resources: + - pipeline/data/importers/corpus/flores.sh + run: + command: + - 
bash + - -c + - $VCS_PATH/pipeline/data/importers/corpus/flores.sh {src_locale} {trg_locale} /builds/worker/artifacts/{dataset} {dataset} + + sacrebleu: + description: Fetch sacrebleu dataset + label: dataset-sacrebleu-{dataset}-{src_locale}-{trg_locale} + provider: sacrebleu + attributes: + cache-resources: + - pipeline/data/importers/corpus/sacrebleu.sh + run: + command: + - bash + - -c + - $VCS_PATH/pipeline/data/importers/corpus/sacrebleu.sh {src_locale} {trg_locale} /builds/worker/artifacts/{dataset} {dataset} + + opus: + description: Fetch opus dataset + # No slashes version of dataset used here because slashes break caches + label: dataset-opus-{dataset_no_slashes}-{src_locale}-{trg_locale} + provider: opus + attributes: + cache-resources: + - pipeline/data/importers/corpus/opus.sh + run: + command: + - bash + - -c + - $VCS_PATH/pipeline/data/importers/corpus/opus.sh {src_locale} {trg_locale} /builds/worker/artifacts/{dataset_no_slashes} {dataset} + + mtdata: + description: Fetch mtdata dataset + label: dataset-mtdata-{dataset}-{src_locale}-{trg_locale} + provider: mtdata + attributes: + cache-resources: + - pipeline/data/importers/corpus/mtdata.sh + run: + command: + - bash + - -c + - $VCS_PATH/pipeline/data/importers/corpus/mtdata.sh {src_locale} {trg_locale} /builds/worker/artifacts/{dataset} {dataset} + + news-crawl: + description: Fetch news-crawl dataset + label: dataset-news-crawl-{dataset}-{src_locale}-{trg_locale} + provider: news-crawl + attributes: + cache-resources: + - pipeline/data/importers/mono/news-crawl.sh + run: + command: + - bash + - -c + - $VCS_PATH/pipeline/data/importers/mono/news-crawl.sh {src_locale} /builds/worker/artifacts/{dataset}.{src_locale} {dataset} && $VCS_PATH/pipeline/data/importers/mono/news-crawl.sh {trg_locale} /builds/worker/artifacts/{dataset}.{trg_locale} {dataset} diff --git a/taskcluster/ci/docker-image/kind.yml b/taskcluster/ci/docker-image/kind.yml index 400745d85..429e9e429 100644 --- 
a/taskcluster/ci/docker-image/kind.yml +++ b/taskcluster/ci/docker-image/kind.yml @@ -9,6 +9,9 @@ transforms: tasks: base: symbol: Base + train: + parent: base + symbol: Train toolchain-build: parent: base symbol: TL diff --git a/taskcluster/docker/toolchain-build/Dockerfile b/taskcluster/docker/toolchain-build/Dockerfile index 7055d19c8..14c642a27 100644 --- a/taskcluster/docker/toolchain-build/Dockerfile +++ b/taskcluster/docker/toolchain-build/Dockerfile @@ -6,6 +6,7 @@ RUN apt-get update -qq \ # we cannot navigate while building the Docker image. && apt-get install -y tzdata \ && apt-get install -y wget \ + curl \ zip \ build-essential \ gcc \ @@ -24,6 +25,9 @@ RUN locale-gen "$LANG" RUN pip install zstandard +# Required to download sacrebleu datasets +RUN pip install sacrebleu mtdata + # %include-run-task ENV SHELL=/bin/bash \ diff --git a/taskcluster/docker/train/Dockerfile b/taskcluster/docker/train/Dockerfile new file mode 100644 index 000000000..58588d800 --- /dev/null +++ b/taskcluster/docker/train/Dockerfile @@ -0,0 +1,15 @@ +FROM $DOCKER_IMAGE_PARENT +LABEL maintainer="Mozilla Release Engineering " + +RUN apt-get update -qq \ + && apt-get install -y python3-numpy \ + python3-fasttext \ + parallel \ + zstd \ + bc \ + && apt-get clean + +# Required to download sacrebleu datasets +RUN pip install sacrebleu + +VOLUME /builds/worker/checkouts diff --git a/taskcluster/translations_taskgraph/transforms/cache.py b/taskcluster/translations_taskgraph/transforms/cache.py new file mode 100644 index 000000000..aaed198b6 --- /dev/null +++ b/taskcluster/translations_taskgraph/transforms/cache.py @@ -0,0 +1,23 @@ +from taskgraph.transforms.base import TransformSequence +from taskgraph.util.hash import hash_path + +transforms = TransformSequence() + +@transforms.add +def add_cache(config, jobs): + for job in jobs: + cache_type = job["attributes"]["cache-type"] + cache_resources = job["attributes"]["cache-resources"] + digest_data = [] + + if cache_resources: + for r in 
cache_resources: + digest_data.append(hash_path(r)) + + job["cache"] = { + "type": cache_type, + "name": job["label"], + "digest-data": digest_data, + } + + yield job diff --git a/taskcluster/translations_taskgraph/transforms/dataset_substitutions.py b/taskcluster/translations_taskgraph/transforms/dataset_substitutions.py new file mode 100644 index 000000000..393a5ae2a --- /dev/null +++ b/taskcluster/translations_taskgraph/transforms/dataset_substitutions.py @@ -0,0 +1,78 @@ +import copy + +from taskgraph.transforms.base import TransformSequence +from taskgraph.util.schema import Schema +from voluptuous import ALLOW_EXTRA, Required + +SCHEMA = Schema( + { + Required("substitution-fields"): [str], + Required("provider"): str, + }, + extra=ALLOW_EXTRA, +) + +transforms = TransformSequence() +transforms.add_validate(SCHEMA) + + +def substitute(item, **subs): + if isinstance(item, list): + for i in range(len(item)): + item[i] = substitute(item[i], **subs) + elif isinstance(item, dict): + new_dict = {} + for k, v in item.items(): + k = k.format(**subs) + new_dict[k] = substitute(v, **subs) + item = new_dict + elif isinstance(item, str): + item = item.format(**subs) + else: + item = item + + return item + + +def shorten_dataset_name(dataset): + """Shortens various dataset names. Mainly used to make sure we can have + useful Treeherder symbols.""" + # TODO: should the replacements live in ci/config.yml? 
+    return (dataset
+        .replace("new-crawl", "nc")
+        .replace("news.2020", "n2020")
+        .replace("Neulab-tedtalks_train-1", "Ntt1")
+    )
+
+@transforms.add
+def render_command(config, jobs):
+    for job in jobs:
+        provider = job.pop("provider")
+        substitution_fields = job.pop("substitution-fields")
+
+        for dataset, locale_pairs in config.graph_config["datasets"][provider].items():
+            for pair in locale_pairs:
+                subjob = copy.deepcopy(job)
+                subs = {
+                    "provider": provider,
+                    "dataset": dataset,
+                    "dataset_short": shorten_dataset_name(dataset),
+                    "dataset_no_slashes": dataset.replace("/", "."),
+                    "src_locale": pair["src"],
+                    "trg_locale": pair["trg"],
+                }
+                for field in substitution_fields:
+                    container, subfield = subjob, field
+                    while "." in subfield:
+                        f, subfield = field.split(".", 1)
+                        container = container[f]
+
+                    container[subfield] = substitute(container[subfield], **subs)
+
+                # If the job has command-context, add these values there
+                # as well. This helps to avoid needing two levels of
+                # substitution in a command.
+ if subjob.get("run", {}).get("command-context"): + subjob["run"]["command-context"].update(subs) + + yield subjob From 571a221a9669d7bf16fce405a84d5ae0c89493f8 Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Fri, 14 Apr 2023 19:57:33 -0400 Subject: [PATCH 09/24] Add configuration for black and ruff for python formatting --- taskcluster/pyproject.toml | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 taskcluster/pyproject.toml diff --git a/taskcluster/pyproject.toml b/taskcluster/pyproject.toml new file mode 100644 index 000000000..f41671d99 --- /dev/null +++ b/taskcluster/pyproject.toml @@ -0,0 +1,7 @@ +[tool.ruff] +line-length = 120 +target-version = "py37" + +[tool.black] +line-length = 120 +target-version = ["py37"] From 835a98d54a1e3c99c7b9535fee6e437d7ee4c51e Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Thu, 13 Apr 2023 12:59:06 -0400 Subject: [PATCH 10/24] Add `clean` stage of the training pipeline This is largely built on the earlier work done on the `dataset` kind. --- taskcluster/ci/clean/kind.yml | 95 +++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 taskcluster/ci/clean/kind.yml diff --git a/taskcluster/ci/clean/kind.yml b/taskcluster/ci/clean/kind.yml new file mode 100644 index 000000000..66504a862 --- /dev/null +++ b/taskcluster/ci/clean/kind.yml @@ -0,0 +1,95 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+--- + +loader: taskgraph.loader.transform:loader + +transforms: + - translations_taskgraph.transforms.dataset_substitutions:transforms + - taskgraph.transforms.job:transforms + - translations_taskgraph.transforms.cache:transforms + - taskgraph.transforms.cached_tasks:transforms + - taskgraph.transforms.task:transforms + +kind-dependencies: + - dataset + +task-defaults: + description: Clean {provider} {dataset} dataset {src_locale}-{trg_locale} + attributes: + cache-type: dataset + cache-resources: + - pipeline/clean/clean-corpus.sh + - pipeline/clean/tools/deescape-special-chars.perl + - pipeline/clean/tools/remove-non-printing-char.perl + - pipeline/clean/tools/clean_parallel.py + - pipeline/clean/tools/langid_fasttext.py + worker-type: b-linux + substitution-fields: + - description + - name + - dependencies + - fetches + - treeherder.symbol + - worker.env + worker: + docker-image: {"in-tree": "train"} + max-run-time: 3600 + artifacts: + - name: public/build + path: /builds/worker/artifacts + type: directory + env: + SRC: "{src_locale}" + TRG: "{trg_locale}" + + # Don't run unless explicitly scheduled + run-on-tasks-for: [] + + treeherder: + symbol: "{provider}({dataset_short}-{src_locale}-{trg_locale})" + platform: clean/opt + run: + using: run-task + command: + - bash + - -c + - $VCS_PATH/pipeline/clean/clean-corpus.sh $MOZ_FETCHES_DIR/{dataset_no_slashes} /builds/worker/artifacts/{dataset_no_slashes} auto {dataset} + dependencies: + "{provider}": dataset-{provider}-{dataset_no_slashes}-{src_locale}-{trg_locale} + fetches: + "{provider}": + - artifact: "{dataset_no_slashes}.{src_locale}.zst" + extract: false + - artifact: "{dataset_no_slashes}.{trg_locale}.zst" + extract: false + +tasks: + flores-{dataset}-{src_locale}-{trg_locale}: + provider: flores + + sacrebleu-{dataset}-{src_locale}-{trg_locale}: + provider: sacrebleu + + opus-{dataset_no_slashes}-{src_locale}-{trg_locale}: + provider: opus + + mtdata-{dataset}-{src_locale}-{trg_locale}: + provider: mtdata 
+    attributes:
+      cache-resources:
+        - pipeline/clean/fixes/mtdata_JW300.mt.sh
+        - pipeline/clean/fixes/mtdata_JW300.sh
+        - pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.ro.sh
+        - pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.sh
+        - pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.ca.sh
+        - pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.es.sh
+        - pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.sh
+        - pipeline/clean/fixes/mtdata_OPUS_ECB_v1.sh
+        - pipeline/clean/fixes/mtdata_OPUS_SETIMES_v2.sh
+        - pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.en.sh
+        - pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.fr.sh
+
+  news-crawl-{dataset}-{src_locale}-{trg_locale}:
+    provider: news-crawl

From 851035c7f79f782c8de88204cff9f81a5f7dc747 Mon Sep 17 00:00:00 2001
From: Ben Hearsum
Date: Thu, 13 Apr 2023 12:59:17 -0400
Subject: [PATCH 11/24] Update pipeline scripts to work with Taskcluster

There are a few things:
1) Mark all the shell scripts as +x
2) Switch out pigz/gz for zstdmt/zst (in progress)
3) Add support for `auto` where `threads` is an argument, which uses
   `nproc` to decide how many threads to use (in progress).
4) Use `curl` instead of `wget` (in progress) --- .../generate-alignment-and-shortlist.sh | 0 pipeline/bicleaner/bicleaner.sh | 33 ++++++----- pipeline/bicleaner/download-pack.sh | 0 pipeline/cefilter/ce-filter.sh | 0 pipeline/cefilter/score.sh | 0 pipeline/clean/clean-corpus.sh | 56 ++++++++++--------- pipeline/clean/clean-mono.sh | 7 ++- pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.sh | 0 pipeline/clean/merge-corpus.sh | 0 pipeline/clean/merge-mono.sh | 0 pipeline/data/download-corpus.sh | 0 pipeline/data/download-mono.sh | 0 .../data/importers/corpus/custom-corpus.sh | 0 pipeline/data/importers/corpus/flores.sh | 11 ++-- pipeline/data/importers/corpus/mtdata.sh | 15 +++-- pipeline/data/importers/corpus/opus.sh | 7 ++- pipeline/data/importers/corpus/sacrebleu.sh | 7 ++- pipeline/data/importers/mono/commoncrawl.sh | 0 pipeline/data/importers/mono/custom-mono.sh | 0 pipeline/data/importers/mono/news-crawl.sh | 7 ++- .../data/importers/mono/paracrawl-mono.sh | 0 pipeline/eval/eval-gpu.sh | 0 pipeline/eval/eval-quantized.sh | 0 pipeline/eval/eval.sh | 0 pipeline/quantize/export.sh | 0 pipeline/quantize/quantize.sh | 0 pipeline/setup/compile-extract-lex.sh | 0 pipeline/setup/compile-fast-align.sh | 0 pipeline/setup/compile-marian.sh | 0 pipeline/setup/compile-preprocess.sh | 0 pipeline/setup/install-deps.sh | 0 pipeline/setup/install-kenlm.sh | 0 pipeline/train/train-student.sh | 0 pipeline/train/train.sh | 0 pipeline/translate/collect.sh | 0 pipeline/translate/merge-corpus.sh | 0 pipeline/translate/split-corpus.sh | 0 pipeline/translate/split-mono.sh | 0 38 files changed, 88 insertions(+), 55 deletions(-) mode change 100644 => 100755 pipeline/alignment/generate-alignment-and-shortlist.sh mode change 100644 => 100755 pipeline/bicleaner/bicleaner.sh mode change 100644 => 100755 pipeline/bicleaner/download-pack.sh mode change 100644 => 100755 pipeline/cefilter/ce-filter.sh mode change 100644 => 100755 pipeline/cefilter/score.sh mode change 100644 => 100755 
pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.sh mode change 100644 => 100755 pipeline/clean/merge-corpus.sh mode change 100644 => 100755 pipeline/clean/merge-mono.sh mode change 100644 => 100755 pipeline/data/download-corpus.sh mode change 100644 => 100755 pipeline/data/download-mono.sh mode change 100644 => 100755 pipeline/data/importers/corpus/custom-corpus.sh mode change 100644 => 100755 pipeline/data/importers/corpus/flores.sh mode change 100644 => 100755 pipeline/data/importers/corpus/mtdata.sh mode change 100644 => 100755 pipeline/data/importers/corpus/opus.sh mode change 100644 => 100755 pipeline/data/importers/corpus/sacrebleu.sh mode change 100644 => 100755 pipeline/data/importers/mono/commoncrawl.sh mode change 100644 => 100755 pipeline/data/importers/mono/custom-mono.sh mode change 100644 => 100755 pipeline/data/importers/mono/news-crawl.sh mode change 100644 => 100755 pipeline/data/importers/mono/paracrawl-mono.sh mode change 100644 => 100755 pipeline/eval/eval-gpu.sh mode change 100644 => 100755 pipeline/eval/eval-quantized.sh mode change 100644 => 100755 pipeline/eval/eval.sh mode change 100644 => 100755 pipeline/quantize/export.sh mode change 100644 => 100755 pipeline/quantize/quantize.sh mode change 100644 => 100755 pipeline/setup/compile-extract-lex.sh mode change 100644 => 100755 pipeline/setup/compile-fast-align.sh mode change 100644 => 100755 pipeline/setup/compile-marian.sh mode change 100644 => 100755 pipeline/setup/compile-preprocess.sh mode change 100644 => 100755 pipeline/setup/install-deps.sh mode change 100644 => 100755 pipeline/setup/install-kenlm.sh mode change 100644 => 100755 pipeline/train/train-student.sh mode change 100644 => 100755 pipeline/train/train.sh mode change 100644 => 100755 pipeline/translate/collect.sh mode change 100644 => 100755 pipeline/translate/merge-corpus.sh mode change 100644 => 100755 pipeline/translate/split-corpus.sh mode change 100644 => 100755 pipeline/translate/split-mono.sh diff --git 
a/pipeline/alignment/generate-alignment-and-shortlist.sh b/pipeline/alignment/generate-alignment-and-shortlist.sh old mode 100644 new mode 100755 diff --git a/pipeline/bicleaner/bicleaner.sh b/pipeline/bicleaner/bicleaner.sh old mode 100644 new mode 100755 index 2371ed26e..de73b5dda --- a/pipeline/bicleaner/bicleaner.sh +++ b/pipeline/bicleaner/bicleaner.sh @@ -23,13 +23,20 @@ type=$4 threads=$5 pack_dir=$6 +COMPRESSION_CMD="${COMPRESSION_CMD:-pigz}" +ARTIFACT_EXT="${ARTIFACT_EXT:-gz}" + +if [ "$threads" = "auto" ]; then + threads=$(nproc) +fi + output_dir=$(dirname "${output_prefix}") mkdir -p "${output_dir}" if [ "${bicleaner_threshold}" == "0" ]; then echo "Threshold is 0, skipping filtering" - cp "${corpus_prefix}.${SRC}.gz" "${output_prefix}.${SRC}.gz" - cp "${corpus_prefix}.${TRG}.gz" "${output_prefix}.${TRG}.gz" + cp "${corpus_prefix}.${SRC}.${ARTIFACT_EXT}" "${output_prefix}.${SRC}.${ARTIFACT_EXT}" + cp "${corpus_prefix}.${TRG}.${ARTIFACT_EXT}" "${output_prefix}.${TRG}.${ARTIFACT_EXT}" else if [ "${type}" == 'bicleaner-ai' ]; then echo "### Using bicleaner-ai" @@ -69,27 +76,27 @@ else } export -f biclean # {%} is a 1-indexed job slot number from GNU parallel. 
We use that as the 1-indexed offset in CUDA_VISIBLE_ARRAY - paste <(pigz -dc "${corpus_prefix}.${SRC}.gz") <(pigz -dc "${corpus_prefix}.${TRG}.gz") | + paste <(${COMPRESSION_CMD} -dc "${corpus_prefix}.${SRC}.${ARTIFACT_EXT}") <(${COMPRESSION_CMD} -dc "${corpus_prefix}.${TRG}.${ARTIFACT_EXT}") | parallel -j ${#CUDA_VISIBLE_ARRAY[@]} --pipe -k --block 10M biclean "${pack_dir}"/*.yaml {%} | - pigz >"${output_prefix}.scored.gz" + ${COMPRESSION_CMD} >"${output_prefix}.scored.${ARTIFACT_EXT}" else - paste <(pigz -dc "${corpus_prefix}.${SRC}.gz") <(pigz -dc "${corpus_prefix}.${TRG}.gz") | + paste <(${COMPRESSION_CMD} -dc "${corpus_prefix}.${SRC}.${ARTIFACT_EXT}") <(${COMPRESSION_CMD} -dc "${corpus_prefix}.${TRG}.${ARTIFACT_EXT}") | ${cmd} --scol ${scol} --tcol ${tcol} --processes "${threads}" - - "${pack_dir}"/*.yaml | - pigz >"${output_prefix}.scored.gz" + ${COMPRESSION_CMD} >"${output_prefix}.scored.${ARTIFACT_EXT}" fi echo "### Filtering" - pigz -dc "${output_prefix}.scored.gz" | + ${COMPRESSION_CMD} -dc "${output_prefix}.scored.${ARTIFACT_EXT}" | awk -v threshold=${bicleaner_threshold} -F"\t" '{if ($3>threshold) {print $0}}' | - pigz >"${output_prefix}.best.gz" + ${COMPRESSION_CMD} >"${output_prefix}.best.${ARTIFACT_EXT}" - echo "Lines before filtering: $(pigz -dc "${output_prefix}.scored.gz" | wc -l)" - echo "Lines after filtering: $(pigz -dc "${output_prefix}.best.gz" | wc -l)" + echo "Lines before filtering: $(${COMPRESSION_CMD} -dc "${output_prefix}.scored.${ARTIFACT_EXT}" | wc -l)" + echo "Lines after filtering: $(${COMPRESSION_CMD} -dc "${output_prefix}.best.${ARTIFACT_EXT}" | wc -l)" echo "### Writing output corpus" - pigz -dc "${output_prefix}.best.gz" | - tee >(cut -f1 | pigz >"${output_prefix}.${SRC}.gz") | - cut -f2 | pigz >"${output_prefix}.${TRG}.gz" + ${COMPRESSION_CMD} -dc "${output_prefix}.best.${ARTIFACT_EXT}" | + tee >(cut -f1 | ${COMPRESSION_CMD} >"${output_prefix}.${SRC}.${ARTIFACT_EXT}") | + cut -f2 | ${COMPRESSION_CMD} 
>"${output_prefix}.${TRG}.${ARTIFACT_EXT}" # do not delete intermediate files to inspect them and tune the threshold fi diff --git a/pipeline/bicleaner/download-pack.sh b/pipeline/bicleaner/download-pack.sh old mode 100644 new mode 100755 diff --git a/pipeline/cefilter/ce-filter.sh b/pipeline/cefilter/ce-filter.sh old mode 100644 new mode 100755 diff --git a/pipeline/cefilter/score.sh b/pipeline/cefilter/score.sh old mode 100644 new mode 100755 diff --git a/pipeline/clean/clean-corpus.sh b/pipeline/clean/clean-corpus.sh index 6216271e2..5846e6b22 100755 --- a/pipeline/clean/clean-corpus.sh +++ b/pipeline/clean/clean-corpus.sh @@ -17,6 +17,12 @@ output_prefix=$2 threads=$3 dataset=$4 +COMPRESSION_CMD="${COMPRESSION_CMD:-pigz}" +ARTIFACT_EXT="${ARTIFACT_EXT:-gz}" + +if [ "$threads" = "auto" ]; then + threads=$(nproc) +fi cd "$(dirname "${0}")" export PYTHONPATH="tools" @@ -28,24 +34,24 @@ echo "### Cleaning ${input_prefix}" ###################################################################### echo "### Basic preprocessing" for lng in "${SRC}" "${TRG}"; do - test -s "${output_prefix}.${lng}.nrm.gz" || - pigz -dc "${input_prefix}.${lng}.gz" | + test -s "${output_prefix}.${lng}.nrm.${ARTIFACT_EXT}" || + ${COMPRESSION_CMD} -dc "${input_prefix}.${lng}.${ARTIFACT_EXT}" | parallel --no-notice --pipe -k -j "${threads}" --block 50M \ "perl tools/deescape-special-chars.perl | perl tools/remove-non-printing-char.perl" | - pigz >"${output_prefix}.${lng}.nrm.gz" + ${COMPRESSION_CMD} >"${output_prefix}.${lng}.nrm.${ARTIFACT_EXT}" done ##################################################################### echo "### Apply monolingual fixes" for lng in $SRC $TRG; do if [[ ! 
-x fixes/${dataset}.${lng}.sh ]]; then - test -s "${output_prefix}.${lng}.monofix.gz" || - cp "${output_prefix}.${lng}.nrm.gz" "${output_prefix}.${lng}.monofix.gz" + test -s "${output_prefix}.${lng}.monofix.${ARTIFACT_EXT}" || + cp "${output_prefix}.${lng}.nrm.${ARTIFACT_EXT}" "${output_prefix}.${lng}.monofix.${ARTIFACT_EXT}" else - test -s "${output_prefix}.${lng}.monofix.gz" || - pigz -dc "${output_prefix}.${lng}.nrm.gz" \ + test -s "${output_prefix}.${lng}.monofix.${ARTIFACT_EXT}" || + ${COMPRESSION_CMD} -dc "${output_prefix}.${lng}.nrm.${ARTIFACT_EXT}" \ | fixes/"${dataset}"."${lng}".sh \ - | pigz >"${output_prefix}.${lng}.monofix.gz" + | ${COMPRESSION_CMD} >"${output_prefix}.${lng}.monofix.${ARTIFACT_EXT}" fi done @@ -56,52 +62,52 @@ if [[ -x fixes/${dataset}.sh ]]; then else FIX="cat" fi -test -s "${output_prefix}.${SRC}${TRG}.fix.gz" || - paste <(pigz -dc "${output_prefix}.${SRC}.monofix.gz") <(pigz -dc "${output_prefix}.${TRG}.monofix.gz") \ +test -s "${output_prefix}.${SRC}${TRG}.fix.${ARTIFACT_EXT}" || + paste <(${COMPRESSION_CMD} -dc "${output_prefix}.${SRC}.monofix.${ARTIFACT_EXT}") <(${COMPRESSION_CMD} -dc "${output_prefix}.${TRG}.monofix.${ARTIFACT_EXT}") \ | $FIX \ - | pigz > "${output_prefix}.${SRC}${TRG}.fix.gz" + | ${COMPRESSION_CMD} > "${output_prefix}.${SRC}${TRG}.fix.${ARTIFACT_EXT}" ###################################################################### echo "### Rule-based filtering" -test -s "${output_prefix}.${SRC}${TRG}.rule-based.gz" || - pigz -dc "${output_prefix}.${SRC}${TRG}.fix.gz" | +test -s "${output_prefix}.${SRC}${TRG}.rule-based.${ARTIFACT_EXT}" || + ${COMPRESSION_CMD} -dc "${output_prefix}.${SRC}${TRG}.fix.${ARTIFACT_EXT}" | parallel --no-notice --pipe -k -j "${threads}" --block 50M \ "python3 tools/clean_parallel.py -l1 ${SRC} -l2 ${TRG} --debug" \ 2>"${output_prefix}.${SRC}${TRG}.clean.debug.txt" | - pigz >"${output_prefix}.${SRC}${TRG}.rule-based.gz" + ${COMPRESSION_CMD} 
>"${output_prefix}.${SRC}${TRG}.rule-based.${ARTIFACT_EXT}" ###################################################################### echo "### Language identification" -test -s "${output_prefix}.${SRC}${TRG}.langid.gz" || - pigz -dc "${output_prefix}.${SRC}${TRG}.rule-based.gz" | +test -s "${output_prefix}.${SRC}${TRG}.langid.${ARTIFACT_EXT}" || + ${COMPRESSION_CMD} -dc "${output_prefix}.${SRC}${TRG}.rule-based.${ARTIFACT_EXT}" | # memory intensive parallel --no-notice --pipe -k -j "$(echo "${threads}"/4 | bc)" --block 50M \ "python3 -Wi tools/langid_fasttext.py -f 1 | python3 -Wi tools/langid_fasttext.py -f 1" | grep -P "^${SRC}\t${TRG}\t" | cut -f3,4 | - pigz >"${output_prefix}.${SRC}${TRG}.langid.gz" + ${COMPRESSION_CMD} >"${output_prefix}.${SRC}${TRG}.langid.${ARTIFACT_EXT}" ###################################################################### echo "### Removing leading and repetitive white spaces" -pigz -dc "${output_prefix}.${SRC}${TRG}.langid.gz" | +${COMPRESSION_CMD} -dc "${output_prefix}.${SRC}${TRG}.langid.${ARTIFACT_EXT}" | cut -f1 | sed -e 's/^[[:space:]]*//' | tr -s " " | -pigz >"${output_prefix}.${SRC}.gz" +${COMPRESSION_CMD} >"${output_prefix}.${SRC}.${ARTIFACT_EXT}" -pigz -dc "${output_prefix}.${SRC}${TRG}.langid.gz" | +${COMPRESSION_CMD} -dc "${output_prefix}.${SRC}${TRG}.langid.${ARTIFACT_EXT}" | cut -f2 | sed -e 's/^[[:space:]]*//' | tr -s " " | -pigz >"${output_prefix}.${TRG}.gz" +${COMPRESSION_CMD} >"${output_prefix}.${TRG}.${ARTIFACT_EXT}" -test -s "${output_prefix}.${SRC}.gz" || exit 1 -test -s "${output_prefix}.${TRG}.gz" || exit 1 +test -s "${output_prefix}.${SRC}.${ARTIFACT_EXT}" || exit 1 +test -s "${output_prefix}.${TRG}.${ARTIFACT_EXT}" || exit 1 echo "### Remove input_prefix from intermediate steps" -rm -rf "${output_prefix}".*.nrm.gz "${output_prefix}".*.langid.gz \ - "${output_prefix}".*.rule-based.gz "${output_prefix}".*.*fix.gz +rm -rf "${output_prefix}".*.nrm.${ARTIFACT_EXT} "${output_prefix}".*.langid.${ARTIFACT_EXT} \ + 
"${output_prefix}".*.rule-based.${ARTIFACT_EXT} "${output_prefix}".*.*fix.${ARTIFACT_EXT} echo "### Clean ${input_prefix} is written to ${output_prefix}" diff --git a/pipeline/clean/clean-mono.sh b/pipeline/clean/clean-mono.sh index fff76a1be..b54ea2938 100755 --- a/pipeline/clean/clean-mono.sh +++ b/pipeline/clean/clean-mono.sh @@ -14,6 +14,9 @@ output_prefix=$3 threads=$4 dataset=$5 +COMPRESSION_CMD="${COMPRESSION_CMD:-pigz}" +ARTIFACT_EXT="${ARTIFACT_EXT:-gz}" + echo "### Cleaning ${input_prefix}" cd "$(dirname "${0}")" @@ -25,10 +28,10 @@ mkdir -p "${dir}" ###################################################################### echo "### Basic preprocessing" test -s "${output_prefix}.${lang}.nrm.gz" || - pigz -dc "${input_prefix}.${lang}.gz" | + zstdmt -dc "${input_prefix}.${lang}.zst" | parallel --no-notice --pipe -k -j "${threads}" --block 50M \ "perl tools/deescape-special-chars.perl | perl tools/remove-non-printing-char.perl" | - pigz >"${output_prefix}.${lang}.nrm.gz" + zstdmt -c >"${output_prefix}.${lang}.nrm.zst" ##################################################################### echo "### Apply monolingual fixes" diff --git a/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.sh b/pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.sh old mode 100644 new mode 100755 diff --git a/pipeline/clean/merge-corpus.sh b/pipeline/clean/merge-corpus.sh old mode 100644 new mode 100755 diff --git a/pipeline/clean/merge-mono.sh b/pipeline/clean/merge-mono.sh old mode 100644 new mode 100755 diff --git a/pipeline/data/download-corpus.sh b/pipeline/data/download-corpus.sh old mode 100644 new mode 100755 diff --git a/pipeline/data/download-mono.sh b/pipeline/data/download-mono.sh old mode 100644 new mode 100755 diff --git a/pipeline/data/importers/corpus/custom-corpus.sh b/pipeline/data/importers/corpus/custom-corpus.sh old mode 100644 new mode 100755 diff --git a/pipeline/data/importers/corpus/flores.sh b/pipeline/data/importers/corpus/flores.sh old mode 100644 new mode 100755 index 
e66e61ed6..b2d4daabe --- a/pipeline/data/importers/corpus/flores.sh +++ b/pipeline/data/importers/corpus/flores.sh @@ -14,7 +14,10 @@ trg=$2 output_prefix=$3 dataset=$4 -tmp="$(dirname "${output_prefix}")/flores/${dataset}" +COMPRESSION_CMD="${COMPRESSION_CMD:-pigz}" +ARTIFACT_EXT="${ARTIFACT_EXT:-gz}" + +tmp="$(mktemp -d)/flores/${dataset}" mkdir -p "${tmp}" wget -O "${tmp}/flores101_dataset.tar.gz" "https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz" @@ -28,7 +31,7 @@ flores_code() { elif [ "${code}" == "zh-Hant" ]; then flores_code="zho_trad" else - flores_code=$(python -c "from mtdata.iso import iso3_code; print(iso3_code('${code}', fail_error=True))") + flores_code=$(python3 -c "from mtdata.iso import iso3_code; print(iso3_code('${code}', fail_error=True))") fi echo "${flores_code}" @@ -37,8 +40,8 @@ flores_code() { src_flores=$(flores_code "${src}") trg_flores=$(flores_code "${trg}") -pigz -c "${tmp}/flores101_dataset/${dataset}/${src_flores}.${dataset}" > "${output_prefix}.${src}.gz" -pigz -c "${tmp}/flores101_dataset/${dataset}/${trg_flores}.${dataset}" > "${output_prefix}.${trg}.gz" +${COMPRESSION_CMD} -c "${tmp}/flores101_dataset/${dataset}/${src_flores}.${dataset}" > "${output_prefix}.${src}.${ARTIFACT_EXT}" +${COMPRESSION_CMD} -c "${tmp}/flores101_dataset/${dataset}/${trg_flores}.${dataset}" > "${output_prefix}.${trg}.${ARTIFACT_EXT}" rm -rf "${tmp}" diff --git a/pipeline/data/importers/corpus/mtdata.sh b/pipeline/data/importers/corpus/mtdata.sh old mode 100644 new mode 100755 index 6e4c6ee10..69ce3ea01 --- a/pipeline/data/importers/corpus/mtdata.sh +++ b/pipeline/data/importers/corpus/mtdata.sh @@ -13,16 +13,21 @@ trg=$2 output_prefix=$3 dataset=$4 +COMPRESSION_CMD="${COMPRESSION_CMD:-pigz}" +ARTIFACT_EXT="${ARTIFACT_EXT:-gz}" + tmp="$(dirname "${output_prefix}")/mtdata/${dataset}" mkdir -p "${tmp}" -src_iso=$(python -c "from mtdata.iso import iso3_code; print(iso3_code('${src}', fail_error=True))") -trg_iso=$(python -c "from 
mtdata.iso import iso3_code; print(iso3_code('${trg}', fail_error=True))") +src_iso=$(python3 -c "from mtdata.iso import iso3_code; print(iso3_code('${src}', fail_error=True))") +trg_iso=$(python3 -c "from mtdata.iso import iso3_code; print(iso3_code('${trg}', fail_error=True))") + +mtdata get -l "${src}-${trg}" -tr "${dataset}" -o "${tmp}" -mtdata get -l "${src}-${trg}" -tr "${dataset}" -o "${tmp}" --compress +find "${tmp}" -mv "${tmp}/train-parts/${dataset}.${src_iso}.gz" "${output_prefix}.${src}.gz" -mv "${tmp}/train-parts/${dataset}.${trg_iso}.gz" "${output_prefix}.${trg}.gz" +cat "${tmp}/train-parts/${dataset}.${src_iso}" | ${COMPRESSION_CMD} -c > "${output_prefix}.${src}.${ARTIFACT_EXT}" +cat "${tmp}/train-parts/${dataset}.${trg_iso}" | ${COMPRESSION_CMD} -c > "${output_prefix}.${trg}.${ARTIFACT_EXT}" rm -rf "${tmp}" diff --git a/pipeline/data/importers/corpus/opus.sh b/pipeline/data/importers/corpus/opus.sh old mode 100644 new mode 100755 index 58172a199..e855b113e --- a/pipeline/data/importers/corpus/opus.sh +++ b/pipeline/data/importers/corpus/opus.sh @@ -13,6 +13,9 @@ trg=$2 output_prefix=$3 dataset=$4 +COMPRESSION_CMD="${COMPRESSION_CMD:-pigz}" +ARTIFACT_EXT="${ARTIFACT_EXT:-gz}" + name=${dataset%%/*} name_and_version="${dataset//[^A-Za-z0-9_- ]/_}" @@ -26,8 +29,8 @@ wget -O "${archive_path}" "https://object.pouta.csc.fi/OPUS-${dataset}/moses/${s unzip -o "${archive_path}" -d "${tmp}" for lang in ${src} ${trg}; do - pigz -c "${tmp}/${name}.${src}-${trg}.${lang}" > "${output_prefix}.${lang}.gz" || - pigz -c "${tmp}/${name}.${trg}-${src}.${lang}" > "${output_prefix}.${lang}.gz" + ${COMPRESSION_CMD} -c "${tmp}/${name}.${src}-${trg}.${lang}" > "${output_prefix}.${lang}.${ARTIFACT_EXT}" || + ${COMPRESSION_CMD} -c "${tmp}/${name}.${trg}-${src}.${lang}" > "${output_prefix}.${lang}.${ARTIFACT_EXT}" done rm -rf "${tmp}" diff --git a/pipeline/data/importers/corpus/sacrebleu.sh b/pipeline/data/importers/corpus/sacrebleu.sh old mode 100644 new mode 100755 index 
cecacc3bf..3f08f3ec5 --- a/pipeline/data/importers/corpus/sacrebleu.sh +++ b/pipeline/data/importers/corpus/sacrebleu.sh @@ -13,7 +13,10 @@ trg=$2 output_prefix=$3 dataset=$4 -sacrebleu -t "${dataset}" -l "${src}-${trg}" --echo src | pigz > "${output_prefix}.${src}.gz" -sacrebleu -t "${dataset}" -l "${src}-${trg}" --echo ref | pigz > "${output_prefix}.${trg}.gz" +COMPRESSION_CMD="${COMPRESSION_CMD:-pigz}" +ARTIFACT_EXT="${ARTIFACT_EXT:-gz}" + +sacrebleu -t "${dataset}" -l "${src}-${trg}" --echo src | ${COMPRESSION_CMD} -c > "${output_prefix}.${src}.${ARTIFACT_EXT}" +sacrebleu -t "${dataset}" -l "${src}-${trg}" --echo ref | ${COMPRESSION_CMD} -c > "${output_prefix}.${trg}.${ARTIFACT_EXT}" echo "###### Done: Downloading sacrebleu corpus" diff --git a/pipeline/data/importers/mono/commoncrawl.sh b/pipeline/data/importers/mono/commoncrawl.sh old mode 100644 new mode 100755 diff --git a/pipeline/data/importers/mono/custom-mono.sh b/pipeline/data/importers/mono/custom-mono.sh old mode 100644 new mode 100755 diff --git a/pipeline/data/importers/mono/news-crawl.sh b/pipeline/data/importers/mono/news-crawl.sh old mode 100644 new mode 100755 index 39d8dc13b..6935200ed --- a/pipeline/data/importers/mono/news-crawl.sh +++ b/pipeline/data/importers/mono/news-crawl.sh @@ -10,9 +10,12 @@ lang=$1 output_prefix=$2 dataset=$3 +COMPRESSION_CMD="${COMPRESSION_CMD:-pigz}" +ARTIFACT_EXT="${ARTIFACT_EXT:-gz}" + echo "###### Downloading WMT newscrawl monolingual data" -wget -O "${output_prefix}.gz" \ - "http://data.statmt.org/news-crawl/${lang}/${dataset}.${lang}.shuffled.deduped.gz" +curl -L "http://data.statmt.org/news-crawl/${lang}/${dataset}.${lang}.shuffled.deduped.gz" | \ + gunzip | ${COMPRESSION_CMD} -c > "${output_prefix}.${ARTIFACT_EXT}" echo "###### Done: Downloading WMT newscrawl monolingual data" diff --git a/pipeline/data/importers/mono/paracrawl-mono.sh b/pipeline/data/importers/mono/paracrawl-mono.sh old mode 100644 new mode 100755 diff --git a/pipeline/eval/eval-gpu.sh 
b/pipeline/eval/eval-gpu.sh old mode 100644 new mode 100755 diff --git a/pipeline/eval/eval-quantized.sh b/pipeline/eval/eval-quantized.sh old mode 100644 new mode 100755 diff --git a/pipeline/eval/eval.sh b/pipeline/eval/eval.sh old mode 100644 new mode 100755 diff --git a/pipeline/quantize/export.sh b/pipeline/quantize/export.sh old mode 100644 new mode 100755 diff --git a/pipeline/quantize/quantize.sh b/pipeline/quantize/quantize.sh old mode 100644 new mode 100755 diff --git a/pipeline/setup/compile-extract-lex.sh b/pipeline/setup/compile-extract-lex.sh old mode 100644 new mode 100755 diff --git a/pipeline/setup/compile-fast-align.sh b/pipeline/setup/compile-fast-align.sh old mode 100644 new mode 100755 diff --git a/pipeline/setup/compile-marian.sh b/pipeline/setup/compile-marian.sh old mode 100644 new mode 100755 diff --git a/pipeline/setup/compile-preprocess.sh b/pipeline/setup/compile-preprocess.sh old mode 100644 new mode 100755 diff --git a/pipeline/setup/install-deps.sh b/pipeline/setup/install-deps.sh old mode 100644 new mode 100755 diff --git a/pipeline/setup/install-kenlm.sh b/pipeline/setup/install-kenlm.sh old mode 100644 new mode 100755 diff --git a/pipeline/train/train-student.sh b/pipeline/train/train-student.sh old mode 100644 new mode 100755 diff --git a/pipeline/train/train.sh b/pipeline/train/train.sh old mode 100644 new mode 100755 diff --git a/pipeline/translate/collect.sh b/pipeline/translate/collect.sh old mode 100644 new mode 100755 diff --git a/pipeline/translate/merge-corpus.sh b/pipeline/translate/merge-corpus.sh old mode 100644 new mode 100755 diff --git a/pipeline/translate/split-corpus.sh b/pipeline/translate/split-corpus.sh old mode 100644 new mode 100755 diff --git a/pipeline/translate/split-mono.sh b/pipeline/translate/split-mono.sh old mode 100644 new mode 100755 From da180ef97f4f8e2a4c68c8dc5b526fccee5579d5 Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Fri, 14 Apr 2023 10:00:34 -0400 Subject: [PATCH 12/24] Add treeherder 
symbol for decision task We need this for action tasks to be triggerable through Treeherder, and it's also generally nice to have. --- .taskcluster.yml | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/.taskcluster.yml b/.taskcluster.yml index b1b137a02..c4bc2fc4f 100644 --- a/.taskcluster.yml +++ b/.taskcluster.yml @@ -121,6 +121,7 @@ tasks: routes: $flatten: - checks + - tc-treeherder.v2.${project}.${head_sha} - $switch: 'tasks_for == "github-push"': - "index.${trustDomain}.v2.${project}.latest.taskgraph.decision" @@ -254,19 +255,35 @@ tasks: # debugging, but they are not useful long-term. expires: {$fromNow: '7 day'} - extra: + extra: + $merge: + - treeherder: $merge: - - $if: 'tasks_for == "action"' + - machine: + platform: gecko-decision + - $if: 'tasks_for == "github-push" || isPullRequest' then: - parent: '${action.taskGroupId}' - action: - name: '${action.name}' - context: - taskGroupId: '${action.taskGroupId}' - taskId: {$eval: 'taskId'} - input: {$eval: 'input'} - clientId: {$eval: 'clientId'} - - $if: 'tasks_for == "cron"' - then: - cron: {$json: {$eval: 'cron'}} - - tasks_for: '${tasks_for}' + symbol: D + else: + $if: 'tasks_for == "action"' + then: + groupName: 'action-callback' + groupSymbol: AC + symbol: "${action.symbol}" + else: + groupSymbol: cron + symbol: "${cron.job_symbol}" + - $if: 'tasks_for == "action"' + then: + parent: '${action.taskGroupId}' + action: + name: '${action.name}' + context: + taskGroupId: '${action.taskGroupId}' + taskId: {$eval: 'taskId'} + input: {$eval: 'input'} + clientId: {$eval: 'clientId'} + - $if: 'tasks_for == "cron"' + then: + cron: {$json: {$eval: 'cron'}} + - tasks_for: '${tasks_for}' From 08fc29de2235f95858def5f06fe858b66dfb5a9b Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Thu, 13 Apr 2023 15:20:28 -0400 Subject: [PATCH 13/24] Add a `train` action task to support kicking off the training pipeline This is very rough for now, but it enables us to 
kick off certain parts of the pipeline. I intend to look into the possibility of using the existing config format (eg: https://github.com/mozilla/firefox-translations-training/blob/main/configs/config.test.yml) as the schema here later, and there's various input checking that needs to be implemented, and other enhancements. --- .taskcluster.yml | 1 - taskcluster/ci/config.yml | 1 + .../translations_taskgraph/__init__.py | 15 +++ .../actions/__init__.py | 0 .../translations_taskgraph/actions/train.py | 107 ++++++++++++++++++ .../translations_taskgraph/target_tasks.py | 10 ++ 6 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 taskcluster/translations_taskgraph/__init__.py create mode 100644 taskcluster/translations_taskgraph/actions/__init__.py create mode 100644 taskcluster/translations_taskgraph/actions/train.py create mode 100644 taskcluster/translations_taskgraph/target_tasks.py diff --git a/.taskcluster.yml b/.taskcluster.yml index c4bc2fc4f..3fa0b02ee 100644 --- a/.taskcluster.yml +++ b/.taskcluster.yml @@ -220,7 +220,6 @@ tasks: then: > cd /builds/worker/checkouts/src && ln -s /builds/worker/artifacts artifacts && - pip3 install -r requirements/base.txt && ~/.local/bin/taskgraph action-callback else: > cd /builds/worker/checkouts/src && diff --git a/taskcluster/ci/config.yml b/taskcluster/ci/config.yml index 169d52492..fd00dadbc 100644 --- a/taskcluster/ci/config.yml +++ b/taskcluster/ci/config.yml @@ -13,6 +13,7 @@ treeherder: "news-crawl": "news-crawl" taskgraph: + register: translations_taskgraph:register cached-task-prefix: "translations.v2.staging-firefox-translations-training" repositories: firefox_translations_training: diff --git a/taskcluster/translations_taskgraph/__init__.py b/taskcluster/translations_taskgraph/__init__.py new file mode 100644 index 000000000..d3b0af449 --- /dev/null +++ b/taskcluster/translations_taskgraph/__init__.py @@ -0,0 +1,15 @@ +from importlib import import_module + + +def register(graph_config): + 
_import_modules( + [ + "actions.train", + "target_tasks", + ] + ) + + +def _import_modules(modules): + for module in modules: + import_module(".{}".format(module), package=__name__) diff --git a/taskcluster/translations_taskgraph/actions/__init__.py b/taskcluster/translations_taskgraph/actions/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/taskcluster/translations_taskgraph/actions/train.py b/taskcluster/translations_taskgraph/actions/train.py new file mode 100644 index 000000000..0db7156e7 --- /dev/null +++ b/taskcluster/translations_taskgraph/actions/train.py @@ -0,0 +1,107 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +from taskgraph.actions.registry import register_callback_action +from taskgraph.decision import taskgraph_decision +from taskgraph.parameters import Parameters + +TRAIN_ON_PROJECTS = ( + "https://github.com/mozilla/firefox-translations-training", + "https://github.com/mozilla-releng/staging-firefox-translations-training", +) + + +def can_train(parameters): + return parameters["head_repository"] in TRAIN_ON_PROJECTS + + +@register_callback_action( + name="train", + title="Train", + symbol="train", + description="Initiate part or all of the training pipeline", + generic=False, + order=500, + context=[], + available=can_train, + # TODO: investigate re-using the exact schema of the existing configs + # for this both to ease the transition to taskcluster, and because they + # have already been well thought out + schema=lambda graph_config: { + "type": "object", + "properties": { + "stage": { + "type": "string", + "description": """The stage of the pipeline to run until +(any stages this choice depends on will be automatically included).""", + "default": "", + # TODO: this should probably be specified in ci/config.yml + "enum": ["clean", "bicleaner", "bicleaner-ai"], + 
}, + "datasets": { + "type": "array", + "description": "The datasets to train with", + "default": [], + "items": { + "type": "string", + # TODO: pull this from ci/config.yml + "enum": ["flores-dev"], + }, + }, + # TODO: should these be replaced with a single pair? + "src_locale": { + "type": "string", + "description": "The src locale to train", + "default": "", + }, + "trg_locale": { + "type": "string", + "description": "The trg locale to train", + "default": "", + }, + # TODO: lots of reworking here. the default should be by dataset + # we may want to re-use the existing pipeline configs, too + "bicleaner_threshold": { + "type": "string", + "description": "bicleaner threshold", + "default": "1.0", + }, + }, + "required": [ + "stage", + "datasets", + "src_locale", + "trg_locale", + ], + }, +) +def train_action(parameters, graph_config, input, task_group_id, task_id): + stage = input["stage"] + target_datasets = input["datasets"] + src_locale = input.get("src_locale") + trg_locale = input.get("trg_locale") + graph_config["datasets"] + locale_str = f"{src_locale}-{trg_locale}" + + # TODO: Add a whack load of verification here. Things such as: + # - datasets all exist + # - locale pair exists for each dataset + # - stage is valid + # etc. + + parameters = dict(parameters) + + parameters["target_tasks_method"] = "train-target-tasks" + + # When doing staging releases, we still want to re-use tasks from previous + # graphs. 
+ parameters["optimize_target_tasks"] = True + parameters["tasks_for"] = "action" + + # make parameters read-only + parameters["target_task_names"] = [f"{stage}-{d}-{locale_str}" for d in target_datasets] + parameters["bicleaner_threshold"] = input["bicleaner_threshold"] + parameters = Parameters(**parameters) + + taskgraph_decision({"root": graph_config.root_dir}, parameters=parameters) diff --git a/taskcluster/translations_taskgraph/target_tasks.py b/taskcluster/translations_taskgraph/target_tasks.py new file mode 100644 index 000000000..32f399745 --- /dev/null +++ b/taskcluster/translations_taskgraph/target_tasks.py @@ -0,0 +1,10 @@ +from taskgraph.target_tasks import _target_task + + +@_target_task("train-target-tasks") +def train_target_tasks(full_task_graph, parameters, graph_config): + def filter(label): + if label in parameters["target_task_names"]: + return True + + return [label for label in full_task_graph.tasks.keys() if filter(label)] From 119aa6b94c8add1b2571d2c3c8fa6a2dd4dc6d1f Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Wed, 19 Apr 2023 14:17:42 -0400 Subject: [PATCH 14/24] Add bicleaner pack fetches These are an additional dependency for the `bicleaner` stage of the pipeline. --- taskcluster/ci/fetch/bicleaner.yml | 273 +++++++++++++++++++++++++++++ taskcluster/ci/fetch/kind.yml | 1 + 2 files changed, 274 insertions(+) create mode 100644 taskcluster/ci/fetch/bicleaner.yml diff --git a/taskcluster/ci/fetch/bicleaner.yml b/taskcluster/ci/fetch/bicleaner.yml new file mode 100644 index 000000000..d6d7b5d99 --- /dev/null +++ b/taskcluster/ci/fetch/bicleaner.yml @@ -0,0 +1,273 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+--- +bicleaner-ai-full-en-xx: + description: bicleaner-ai full en-xx pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-ai-data/releases/download/v2.0/full-en-xx.tgz + sha256: 6429bf2802224e7ae52493a4aef4c0c78fb2b86837f3ebf8a3613432e6227880 + size: 952926795 + artifact-name: bicleaner-full-en-xx.tar.zst + +bicleaner-en-bg: + description: bicleaner en-bg pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-bg.tar.gz + sha256: a0c2cdd27aeef2c12eb216357cbfb5a1b8920f88c469694e515e8ac61ed29d28 + size: 397053400 + artifact-name: bicleaner-en-bg.tar.zst + +bicleaner-en-ca: + description: bicleaner en-ca pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-ca.tar.gz + sha256: 84890e244bc8fd769a906c3ff4280873d73d83700394808655605c4acc39d5f7 + size: 273582123 + artifact-name: bicleaner-en-ca.tar.zst + +bicleaner-en-cs: + description: bicleaner en-cs pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-cs.tar.gz + sha256: 483520b5cf788aad393a06640e1259293018051445e435186ba66f8d8f0d7c3b + size: 359498838 + artifact-name: bicleaner-en-cs.tar.zst + +bicleaner-en-da: + description: bicleaner en-da pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-da.tar.gz + sha256: 5ea4724af3c00581eede1927fe5467d3fa73ec723a9adebdc083c4f0ae29092e + size: 333511920 + artifact-name: bicleaner-en-da.tar.zst + +bicleaner-en-de: + description: bicleaner en-de pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-de.tar.gz + sha256: c932449563ef0108757382d7d2de2dae2bdd97f78cf9dcee3fcf734a80c5b98d + size: 523179851 + artifact-name: bicleaner-en-de.tar.zst + +bicleaner-en-el: + description: bicleaner en-el pack + fetch: + type: static-url + url: 
https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-el.tar.gz + sha256: c71c53407d8f29f0c0bc78c9c5cd5d9c266ca1414eda27affde86a4d792ca7c8 + size: 415029223 + artifact-name: bicleaner-en-el.tar.zst + +bicleaner-en-es: + description: bicleaner en-es pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-es.tar.gz + sha256: 96e5d16d8f30007d203b3c7e362a729f6e5fe60be56abdeccab66b14cc7783c3 + size: 351612333 + artifact-name: bicleaner-en-es.tar.zst + +bicleaner-en-et: + description: bicleaner en-et pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-et.tar.gz + sha256: 0c8c4acc4856d34f5a9d887a5d4eb6572217e757f32f7b6cdc4c2862ec34f443 + size: 434084344 + artifact-name: bicleaner-en-et.tar.zst + +bicleaner-en-fi: + description: bicleaner en-fi pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-fi.tar.gz + sha256: feb8390ec911a888678b86751d6eaeb56b1a916bbea5a0bbdc0ef76428398017 + size: 503190781 + artifact-name: bicleaner-en-fi.tar.zst + +bicleaner-en-fr: + description: bicleaner en-fr pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-fr.tar.gz + sha256: e32134c6e3563a2e2b86e03bc082dea29f99f16f06a1127ed8cda1d4be6ec284 + size: 301237052 + artifact-name: bicleaner-en-fr.tar.zst + +bicleaner-en-ga: + description: bicleaner en-ga pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-ga.tar.gz + sha256: 69f812bcbc1dcc44c16a4386545b33a41fb3e7ed76a3fc2fa7f63433d5088898 + size: 429467347 + artifact-name: bicleaner-en-ga.tar.zst + +bicleaner-en-hr: + description: bicleaner en-hr pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-hr.tar.gz + sha256: e866ab391aecb3f14e9c984f59dc69ab2884a66c2a0cc1c651e734728e499856 + size: 396828015 + 
artifact-name: bicleaner-en-hr.tar.zst + +bicleaner-en-hu: + description: bicleaner en-hu pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-hu.tar.gz + sha256: 366807757aa5381609b0bfc39ef26741ef8c1e957047c3330958b76e9b392237 + size: 415085287 + artifact-name: bicleaner-en-hu.tar.zst + +bicleaner-en-is: + description: bicleaner en-is pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-is.tar.gz + sha256: c3efb8676198fddab36052d7f606c010ce3c7c659e924f5bfac3753023167f55 + size: 420381350 + artifact-name: bicleaner-en-is.tar.zst + +bicleaner-en-it: + description: bicleaner en-it pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-it.tar.gz + sha256: 2aeb588cd0b931a468aa043ac8285fd89c8aa52cdd6a04d878ce873d1d472fb6 + size: 282449727 + artifact-name: bicleaner-en-it.tar.zst + +bicleaner-en-lt: + description: bicleaner en-lt pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-lt.tar.gz + sha256: 01c64b93362a8140811c66c3ca1be0a35c2deb8404bd63df3cb55ffefeb5858b + size: 433983794 + artifact-name: bicleaner-en-lt.tar.zst + +bicleaner-en-lv: + description: bicleaner en-lv pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-lv.tar.gz + sha256: f92663d90bb12faaa1f8d9fc01c6546d09bacb9036442d112d5af6e57cec0bfd + size: 395646027 + artifact-name: bicleaner-en-lv.tar.zst + +bicleaner-en-mt: + description: bicleaner en-mt pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-mt.tar.gz + sha256: f1f477cc76049a63ebb751ec01bafbe35cd2a7e3f1fa95d516001239fdfde3d2 + size: 446888765 + artifact-name: bicleaner-en-mt.tar.zst + +bicleaner-en-nb: + description: bicleaner en-nb pack + fetch: + type: static-url + url: 
https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-nb.tar.gz + sha256: 8289271d1b92cb78ccee0f90af648fd0e2b1e034bf7ce3555dee3cb9bb7b8561 + size: 477583939 + artifact-name: bicleaner-en-nb.tar.zst + +bicleaner-en-nl: + description: bicleaner en-nl pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-nl.tar.gz + sha256: 3f0885999c9bd4f9a9d1e4d79cc8e1cf5532cf88e46758e1a36ba3be84b8a5ca + size: 555802202 + artifact-name: bicleaner-en-nl.tar.zst + +bicleaner-en-nn: + description: bicleaner en-nn pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-nn.tar.gz + sha256: 8c1ab49c94d9ed9fb093917dd7a80d8bedd047a18f88dbf8a7e81507fad0d73b + size: 324817842 + artifact-name: bicleaner-en-nn.tar.zst + +bicleaner-en-pl: + description: bicleaner en-pl pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-pl.tar.gz + sha256: e2bc5d07cba8a5b59eb0f5583b0c4f1c9f4ead096bcd0186a6f22fdd631c8172 + size: 380483224 + artifact-name: bicleaner-en-pl.tar.zst + +bicleaner-en-pt: + description: bicleaner en-pt pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-pt.tar.gz + sha256: 5d48b074f955bbfa582c6c47bbadc90178b04a392b58af27178389677f1c135e + size: 431671623 + artifact-name: bicleaner-en-pt.tar.zst + +bicleaner-en-ro: + description: bicleaner en-ro pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-ro.tar.gz + sha256: 25048eaf627c3f950e44898d7892b11b06321ee9c4bfe16dab1e1400f1336abc + size: 393528818 + artifact-name: bicleaner-en-ro.tar.zst + +bicleaner-en-ru: + description: bicleaner en-ru pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-ru.tar.gz + sha256: ad445d317665bf81a895dca2c10c4466e76f9d7fbfb32d9a08770ddef34ea42e + size: 508534409 + 
artifact-name: bicleaner-en-ru.tar.zst + +bicleaner-en-sk: + description: bicleaner en-sk pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-sk.tar.gz + sha256: e0b5cab1198b73ed67d8c72bd9622a6a0362427221d321aa2631c98e04de4e34 + size: 336424929 + artifact-name: bicleaner-en-sk.tar.zst + +bicleaner-en-sl: + description: bicleaner en-sl pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-sl.tar.gz + sha256: 9a365867cc2f10b0f58d18ee14834f1b42525fddf23402ce3ac54bfcf8bd9a5b + size: 400519843 + artifact-name: bicleaner-en-sl.tar.zst + +bicleaner-en-sv: + description: bicleaner en-sv pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-sv.tar.gz + sha256: 773d0f2e32f8399c465966a32647a45289bbd85a53c7c4fafbac189ac3c028b6 + size: 401161427 + artifact-name: bicleaner-en-sv.tar.zst + +bicleaner-en-uk: + description: bicleaner en-uk pack + fetch: + type: static-url + url: https://github.com/bitextor/bicleaner-data/releases/download/v1.6/en-uk.tar.gz + sha256: ab7ef285c3ec5ab0878aa45e5a6b20b60c41dd9c25e695bcb9dca022e6cf2541 + size: 336387092 + artifact-name: bicleaner-en-uk.tar.zst diff --git a/taskcluster/ci/fetch/kind.yml b/taskcluster/ci/fetch/kind.yml index 34789a8d4..0ba97ca3f 100644 --- a/taskcluster/ci/fetch/kind.yml +++ b/taskcluster/ci/fetch/kind.yml @@ -13,5 +13,6 @@ task-defaults: docker-image: {in-tree: base} tasks-from: + - bicleaner.yml - python.yml - toolchains.yml From dabd4dff491b1c04d91d5b125fd1b709f970c57c Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Wed, 19 Apr 2023 16:19:10 -0400 Subject: [PATCH 15/24] Implement `bicleaner` pipeline stage Very similar to the `clean` and `dataset` stages that have already been implemented. The notable differences are: - The `bicleaner` tool that eventually gets called has a bunch of Python dependencies. 
Most of these are handled by the requirements file I'm adding, but there's two extra ones that don't have binary wheels available -- so we're grabbing them from our toolchain builds and using those. (In fact, `kenlm` isn't even declared as a dependency by `bicleaner`...so we'd have to install it by hand one way or another...) - At the moment, this is using a new `split_by_provider` transform that avoids us needing to list out each provider in the kind. This probably needs to go away, because I recently learned that many pipeline steps (such as this one) don't run for all providers. - An enhancement to the cache transform to allow specifying `parameters` that should contribute to the cache digest - A similar enhancement to the substitution transform to allow substituting parameters --- .../bicleaner/requirements/bicleaner-ai.in | 1 + .../bicleaner/requirements/bicleaner-ai.txt | 223 ++++++++++++++++++ pipeline/bicleaner/requirements/bicleaner.in | 1 + pipeline/bicleaner/requirements/bicleaner.txt | 86 +++++++ taskcluster/ci/bicleaner/kind.yml | 132 +++++++++++ taskcluster/ci/config.yml | 7 + taskcluster/docker/train/Dockerfile | 1 + .../translations_taskgraph/__init__.py | 1 + .../translations_taskgraph/parameters.py | 23 ++ .../transforms/cache.py | 6 + .../transforms/command_context_from_params.py | 30 +++ .../transforms/split_by_provider.py | 18 ++ 12 files changed, 529 insertions(+) create mode 100644 pipeline/bicleaner/requirements/bicleaner-ai.in create mode 100644 pipeline/bicleaner/requirements/bicleaner-ai.txt create mode 100644 pipeline/bicleaner/requirements/bicleaner.in create mode 100644 pipeline/bicleaner/requirements/bicleaner.txt create mode 100644 taskcluster/ci/bicleaner/kind.yml create mode 100644 taskcluster/translations_taskgraph/parameters.py create mode 100644 taskcluster/translations_taskgraph/transforms/command_context_from_params.py create mode 100644 taskcluster/translations_taskgraph/transforms/split_by_provider.py diff --git 
a/pipeline/bicleaner/requirements/bicleaner-ai.in b/pipeline/bicleaner/requirements/bicleaner-ai.in new file mode 100644 index 000000000..bc3a0900d --- /dev/null +++ b/pipeline/bicleaner/requirements/bicleaner-ai.in @@ -0,0 +1 @@ +bicleaner-ai==2.0 diff --git a/pipeline/bicleaner/requirements/bicleaner-ai.txt b/pipeline/bicleaner/requirements/bicleaner-ai.txt new file mode 100644 index 000000000..558415cba --- /dev/null +++ b/pipeline/bicleaner/requirements/bicleaner-ai.txt @@ -0,0 +1,223 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile bicleaner-ai.in +# +absl-py==1.4.0 + # via + # tensorboard + # tensorflow +astunparse==1.6.3 + # via tensorflow +bicleaner-ai==2.0 + # via -r bicleaner-ai.in +bicleaner-ai-glove==0.2.1 + # via bicleaner-ai +bicleaner-hardrules==2.7.0 + # via bicleaner-ai +cachetools==5.3.0 + # via google-auth +certifi==2022.12.7 + # via requests +charset-normalizer==3.1.0 + # via requests +click==8.1.3 + # via sacremoses +exceptiongroup==1.1.1 + # via pytest +fastspell==0.5 + # via bicleaner-hardrules +fasttext==0.9.2 + # via + # bicleaner-hardrules + # fastspell +filelock==3.12.0 + # via + # huggingface-hub + # transformers +flatbuffers==23.3.3 + # via tensorflow +fuzzywuzzy==0.18.0 + # via bicleaner-ai +gast==0.4.0 + # via tensorflow +google-auth==2.17.3 + # via + # google-auth-oauthlib + # tensorboard +google-auth-oauthlib==0.4.6 + # via tensorboard +google-pasta==0.2.0 + # via tensorflow +grpcio==1.54.0 + # via + # tensorboard + # tensorflow +h5py==3.8.0 + # via tensorflow +huggingface-hub==0.11.1 + # via + # bicleaner-ai + # transformers +hunspell==0.5.5 + # via fastspell +idna==3.4 + # via requests +iniconfig==2.0.0 + # via pytest +joblib==1.2.0 + # via + # bicleaner-ai + # bicleaner-hardrules + # sacremoses + # scikit-learn +keras==2.11.0 + # via tensorflow +levenshtein==0.20.9 + # via python-levenshtein +libclang==16.0.0 + # via tensorflow +markdown==3.4.3 + # via tensorboard 
+markupsafe==2.1.2 + # via werkzeug +numpy==1.24.2 + # via + # bicleaner-ai + # bicleaner-ai-glove + # fasttext + # h5py + # opt-einsum + # scikit-learn + # scipy + # tensorboard + # tensorflow + # transformers +oauthlib==3.2.2 + # via requests-oauthlib +opt-einsum==3.3.0 + # via tensorflow +packaging==23.1 + # via + # huggingface-hub + # pytest + # tensorflow + # transformers +pluggy==1.0.0 + # via pytest +protobuf==3.19.6 + # via + # tensorboard + # tensorflow +psutil==5.9.5 + # via bicleaner-ai +pyasn1==0.4.8 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.2.8 + # via google-auth +pybind11==2.10.4 + # via fasttext +pytest==7.3.1 + # via + # bicleaner-ai + # bicleaner-hardrules +python-levenshtein==0.20.9 + # via bicleaner-ai +pyyaml==6.0 + # via + # bicleaner-ai + # bicleaner-hardrules + # fastspell + # huggingface-hub + # transformers +rapidfuzz==2.15.1 + # via levenshtein +regex==2023.3.23 + # via + # bicleaner-ai + # bicleaner-hardrules + # sacremoses + # transformers +requests==2.28.2 + # via + # huggingface-hub + # requests-oauthlib + # tensorboard + # transformers +requests-oauthlib==1.3.1 + # via google-auth-oauthlib +rsa==4.9 + # via google-auth +sacremoses==0.0.53 + # via + # bicleaner-ai + # bicleaner-hardrules + # fastspell +scikit-learn==1.2.2 + # via bicleaner-ai +scipy==1.10.1 + # via + # bicleaner-ai-glove + # scikit-learn +sentencepiece==0.1.98 + # via bicleaner-ai +six==1.16.0 + # via + # astunparse + # google-auth + # google-pasta + # sacremoses + # tensorflow +tensorboard==2.11.2 + # via tensorflow +tensorboard-data-server==0.6.1 + # via tensorboard +tensorboard-plugin-wit==1.8.1 + # via tensorboard +tensorflow==2.11.1 + # via bicleaner-ai +tensorflow-estimator==2.11.0 + # via tensorflow +tensorflow-io-gcs-filesystem==0.32.0 + # via tensorflow +termcolor==2.2.0 + # via tensorflow +threadpoolctl==3.1.0 + # via scikit-learn +tokenizers==0.13.3 + # via transformers +tomli==2.0.1 + # via pytest +toolwrapper==2.1.0 + # via + # bicleaner-ai + 
# bicleaner-hardrules +tqdm==4.65.0 + # via + # huggingface-hub + # sacremoses + # transformers +transformers==4.26 + # via bicleaner-ai +typing-extensions==4.5.0 + # via + # huggingface-hub + # tensorflow +urllib3==1.26.15 + # via + # fastspell + # requests +werkzeug==2.2.3 + # via tensorboard +wheel==0.40.0 + # via + # astunparse + # tensorboard +wrapt==1.15.0 + # via tensorflow + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/pipeline/bicleaner/requirements/bicleaner.in b/pipeline/bicleaner/requirements/bicleaner.in new file mode 100644 index 000000000..0d66b8934 --- /dev/null +++ b/pipeline/bicleaner/requirements/bicleaner.in @@ -0,0 +1 @@ +bicleaner==0.16 diff --git a/pipeline/bicleaner/requirements/bicleaner.txt b/pipeline/bicleaner/requirements/bicleaner.txt new file mode 100644 index 000000000..b0ec1d115 --- /dev/null +++ b/pipeline/bicleaner/requirements/bicleaner.txt @@ -0,0 +1,86 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile bicleaner.in +# +bicleaner==0.16 + # via -r bicleaner.in +bicleaner-hardrules==2.5.1 + # via bicleaner +click==8.1.3 + # via sacremoses +exceptiongroup==1.1.1 + # via pytest +fastspell==0.4 + # via bicleaner-hardrules +fasttext==0.9.2 + # via + # bicleaner-hardrules + # fastspell +hunspell==0.5.5 + # via fastspell +iniconfig==2.0.0 + # via pytest +joblib==1.2.0 + # via + # bicleaner + # bicleaner-hardrules + # sacremoses + # scikit-learn +numpy==1.24.2 + # via + # bicleaner + # fasttext + # scikit-learn + # scipy +packaging==23.1 + # via pytest +pluggy==1.0.0 + # via pytest +pybind11==2.10.4 + # via fasttext +pycld2==0.41 + # via bicleaner +pytest==7.3.1 + # via + # bicleaner + # bicleaner-hardrules +pyyaml==6.0 + # via + # bicleaner + # bicleaner-hardrules + # fastspell +regex==2023.3.23 + # via + # bicleaner + # bicleaner-hardrules + # sacremoses +sacremoses==0.0.53 + # via + # bicleaner + # 
bicleaner-hardrules + # fastspell +scikit-learn==1.1.3 + # via bicleaner +scipy==1.10.1 + # via + # bicleaner + # scikit-learn +six==1.16.0 + # via sacremoses +threadpoolctl==3.1.0 + # via scikit-learn +tomli==2.0.1 + # via pytest +toolwrapper==2.1.0 + # via + # bicleaner + # bicleaner-hardrules +tqdm==4.65.0 + # via sacremoses +urllib3==1.26.15 + # via fastspell + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/taskcluster/ci/bicleaner/kind.yml b/taskcluster/ci/bicleaner/kind.yml new file mode 100644 index 000000000..feff6bd3e --- /dev/null +++ b/taskcluster/ci/bicleaner/kind.yml @@ -0,0 +1,132 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +--- +# TODO: this may not be sude for all locale pairs? or not for all dataset types? +# TODO: this this run on large instances? gpu? + +loader: taskgraph.loader.transform:loader + +transforms: + - translations_taskgraph.transforms.split_by_provider:transforms + - translations_taskgraph.transforms.dataset_substitutions:transforms + - translations_taskgraph.transforms.command_context_from_params:transforms + - taskgraph.transforms.job:transforms + - translations_taskgraph.transforms.cache:transforms + - taskgraph.transforms.cached_tasks:transforms + - taskgraph.transforms.task:transforms + +kind-dependencies: + - clean + - fetch + - toolchain + +task-defaults: + attributes: + cache-resources: + - pipeline/bicleaner/bicleaner.sh + cache-parameters: + - bicleaner_threshold + substitution-fields: + - description + - name + - dependencies + - fetches + - treeherder.symbol + - worker.env + worker: + max-run-time: 3600 + artifacts: + - name: public/build + path: /builds/worker/artifacts + type: directory + env: + SRC: "{src_locale}" + TRG: "{trg_locale}" + # It would be preferable to use $MOZ_FETCHES_DIR here, but these 
don't + # get interpreted. + CUDA_DIR: /builds/worker/fetches/cuda-toolkit + CUDNN_DIR: /builds/worker/fetches/cuda-toolkit + + # Don't run unless explicitly scheduled + run-on-tasks-for: [] + + treeherder: + symbol: "{provider}({dataset_short}-{src_locale}-{trg_locale})" + run: + using: run-task + command-context: + from-parameters: + - bicleaner_threshold + command: + - bash + - -c + # We can't inline comments for the args to `bicleaner.sh`, so they're explained + # here instead: + # 1) prefix for input data + # 2) prefix for output data + # 3) bicleaner threshold + # 4) bicleaner type + # 5) number of threads to use - auto means nproc + # 6) "pack dir" - which needs to be where the `bicleaner-src-trg` fetch was unpacked to + - >- + pip install $MOZ_FETCHES_DIR/hunspell-0.5.5-cp310-cp310-linux_x86_64.whl && + pip install $MOZ_FETCHES_DIR/kenlm-0.0.0-cp310-cp310-linux_x86_64.whl && + pip install -r {bicleaner_reqs} && + $VCS_PATH/pipeline/bicleaner/bicleaner.sh + $MOZ_FETCHES_DIR/{dataset_no_slashes} + /builds/worker/artifacts/{dataset_no_slashes} + {bicleaner_threshold} + {bicleaner_type} + {bicleaner_threads} + $MOZ_FETCHES_DIR/{src_locale}-{trg_locale} + dependencies: + "{provider}": clean-{provider}-{dataset_no_slashes}-{src_locale}-{trg_locale} + fetches: + fetch: + - bicleaner-{src_locale}-{trg_locale} + toolchain: + - hunspell + - kenlm + - cuda-toolkit + "{provider}": + - artifact: "{dataset_no_slashes}.{src_locale}.zst" + extract: false + - artifact: "{dataset_no_slashes}.{trg_locale}.zst" + extract: false + +tasks: + "{provider}-{dataset}-{src_locale}-{trg_locale}": + description: bicleaner for {provider} {dataset} dataset {src_locale}-{trg_locale} + worker-type: b-linux + worker: + docker-image: {"in-tree": "train"} + treeherder: + platform: bicleaner/opt + attributes: + cache-type: bicleaner + cache-resources: + - pipeline/bicleaner/requirements/bicleaner.txt + run: + command-context: + bicleaner_type: bicleaner + bicleaner_reqs: 
$VCS_PATH/pipeline/bicleaner/requirements/bicleaner.txt + # auto = use `nproc` value + bicleaner_threads: auto + + ai-{provider}-{dataset}-{src_locale}-{trg_locale}: + description: bicleaner-ai for {provider} {dataset} dataset {src_locale}-{trg_locale} + worker-type: t-linux-v100-gpu + treeherder: + platform: bicleaner-ai/opt + attributes: + cache-type: bicleaner-ai + cache-resources: + - pipeline/bicleaner/requirements/bicleaner-ai.txt + run: + command-context: + bicleaner_type: bicleaner-ai + bicleaner_reqs: $VCS_PATH/pipeline/bicleaner/requirements/bicleaner-ai.txt + # TODO: set this to a sensible value based on number of GPUs? + # or maybe it should also be `auto`? + bicleaner_threads: auto diff --git a/taskcluster/ci/config.yml b/taskcluster/ci/config.yml index fd00dadbc..08bb36e8a 100644 --- a/taskcluster/ci/config.yml +++ b/taskcluster/ci/config.yml @@ -15,6 +15,7 @@ treeherder: taskgraph: register: translations_taskgraph:register cached-task-prefix: "translations.v2.staging-firefox-translations-training" + decision-parameters: "translations_taskgraph.parameters:get_decision_parameters" repositories: firefox_translations_training: name: "staging-firefox-translations-training" @@ -78,6 +79,12 @@ workers: implementation: docker-worker os: linux worker-type: '{alias}-gcp' + # TODO: this should probably be b-linux...need to update ci-config + t-linux-v100-gpu: + provisioner: '{trust-domain}-{level}' + implementation: generic-worker + os: linux + worker-type: '{alias}' images: provisioner: '{trust-domain}-{level}' implementation: docker-worker diff --git a/taskcluster/docker/train/Dockerfile b/taskcluster/docker/train/Dockerfile index 58588d800..505be1df8 100644 --- a/taskcluster/docker/train/Dockerfile +++ b/taskcluster/docker/train/Dockerfile @@ -7,6 +7,7 @@ RUN apt-get update -qq \ parallel \ zstd \ bc \ + libhunspell-1.7-0 \ && apt-get clean # Required to download sacrebleu datasets diff --git a/taskcluster/translations_taskgraph/__init__.py 
b/taskcluster/translations_taskgraph/__init__.py index d3b0af449..c90682666 100644 --- a/taskcluster/translations_taskgraph/__init__.py +++ b/taskcluster/translations_taskgraph/__init__.py @@ -5,6 +5,7 @@ def register(graph_config): _import_modules( [ "actions.train", + "parameters", "target_tasks", ] ) diff --git a/taskcluster/translations_taskgraph/parameters.py b/taskcluster/translations_taskgraph/parameters.py new file mode 100644 index 000000000..2b56aa899 --- /dev/null +++ b/taskcluster/translations_taskgraph/parameters.py @@ -0,0 +1,23 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +from taskgraph.parameters import extend_parameters_schema +from voluptuous import Optional + +def get_defaults(repo_root): + return { + "bicleaner_threshold": "0.0", + } + +extend_parameters_schema( + { + Optional("bicleaner_threshold"): str, + }, + defaults_fn=get_defaults, +) + +def get_decision_parameters(graph_config, parameters): + for k, v in get_defaults("").items(): + if k not in parameters: + parameters[k] = v diff --git a/taskcluster/translations_taskgraph/transforms/cache.py b/taskcluster/translations_taskgraph/transforms/cache.py index aaed198b6..1de81972c 100644 --- a/taskcluster/translations_taskgraph/transforms/cache.py +++ b/taskcluster/translations_taskgraph/transforms/cache.py @@ -8,12 +8,18 @@ def add_cache(config, jobs): for job in jobs: cache_type = job["attributes"]["cache-type"] cache_resources = job["attributes"]["cache-resources"] + cache_parameters = job["attributes"].get("cache-parameters", {}) digest_data = [] if cache_resources: for r in cache_resources: digest_data.append(hash_path(r)) + if cache_parameters: + for p in cache_parameters: + # TODO: this should somehow find the default value for each parameter... 
+ digest_data.append(config.params.get(p, "")) + job["cache"] = { "type": cache_type, "name": job["label"], diff --git a/taskcluster/translations_taskgraph/transforms/command_context_from_params.py b/taskcluster/translations_taskgraph/transforms/command_context_from_params.py new file mode 100644 index 000000000..a1b5ae521 --- /dev/null +++ b/taskcluster/translations_taskgraph/transforms/command_context_from_params.py @@ -0,0 +1,30 @@ +import copy + +from taskgraph.transforms.base import TransformSequence +from taskgraph.util.schema import Schema +from voluptuous import ALLOW_EXTRA, Required + +SCHEMA = Schema( + { + Required("run"): { + Required("command-context"): { + Required("from-parameters"): [str], + }, + }, + }, + extra=ALLOW_EXTRA, +) + +transforms = TransformSequence() +transforms.add_validate(SCHEMA) + + +@transforms.add +def render_command(config, jobs): + for job in jobs: + subjob = copy.deepcopy(job) + + for param in job["run"]["command-context"]["from-parameters"]: + subjob["run"]["command-context"][param] = config.params[param] + + yield subjob diff --git a/taskcluster/translations_taskgraph/transforms/split_by_provider.py b/taskcluster/translations_taskgraph/transforms/split_by_provider.py new file mode 100644 index 000000000..640fdc3c3 --- /dev/null +++ b/taskcluster/translations_taskgraph/transforms/split_by_provider.py @@ -0,0 +1,18 @@ +import copy + +from taskgraph.transforms.base import TransformSequence + +transforms = TransformSequence() + + +@transforms.add +def split_by_provider(config, jobs): + for job in jobs: + for provider in config.graph_config["datasets"]: + subjob = copy.deepcopy(job) + subjob["provider"] = provider + if "{provider}" not in subjob["name"]: + raise Exception(f"Cannot find {{provider}} substitution in {subjob['name']}; aborting") + + subjob["name"] = subjob["name"].replace("{provider}", provider) + yield subjob From ba43205a2fc4d2b85199db41f0f831d63d98662f Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Wed, 26 Apr 
2023 10:58:12 -0400 Subject: [PATCH 16/24] Raise taskgraph level for pushes, cron, and actions to level 3 Most of this diff is just indentation changes. --- .taskcluster.yml | 401 ++++++++++++++++++++++++----------------------- 1 file changed, 203 insertions(+), 198 deletions(-) diff --git a/.taskcluster.yml b/.taskcluster.yml index 3fa0b02ee..7d3e1fd3c 100644 --- a/.taskcluster.yml +++ b/.taskcluster.yml @@ -11,7 +11,6 @@ policy: tasks: - $let: trustDomain: "translations" - level: "1" ownerEmail: $switch: 'tasks_for == "github-push"': '${event.pusher.email}' @@ -75,214 +74,220 @@ tasks: || (tasks_for == "github-push" && head_branch == "refs/heads/main") || (isPullRequest && pullRequestAction in ["opened", "reopened", "synchronize"]) then: - taskId: {$if: 'tasks_for != "action"', then: '${ownTaskId}'} - taskGroupId: - $if: 'tasks_for == "action"' - then: - '${action.taskGroupId}' - else: - '${ownTaskId}' # same as taskId; this is how automation identifies a decision task - schedulerId: '${trustDomain}-level-${level}' - created: {$fromNow: ''} - deadline: {$fromNow: '1 day'} - expires: {$fromNow: '1 year 1 second'} # 1 second so artifacts expire first - metadata: - $merge: - - owner: "${ownerEmail}" - source: "${repoUrl}/raw/${head_sha}/.taskcluster.yml" - - $switch: - 'tasks_for == "github-push" || isPullRequest': - name: "Decision Task" - description: 'The task that creates all of the other tasks in the task graph' - 'tasks_for == "action"': - name: "Action: ${action.title}" - description: | - ${action.description} + $let: + level: + $if: 'tasks_for in ["github-push", "cron", "action"] && repoUrl == "https://github.com/mozilla/firefox-translations-training"' + then: 3 + else: 1 + in: + taskId: {$if: 'tasks_for != "action"', then: '${ownTaskId}'} + taskGroupId: + $if: 'tasks_for == "action"' + then: + '${action.taskGroupId}' + else: + '${ownTaskId}' # same as taskId; this is how automation identifies a decision task + schedulerId: '${trustDomain}-level-${level}' + 
created: {$fromNow: ''} + deadline: {$fromNow: '1 day'} + expires: {$fromNow: '1 year 1 second'} # 1 second so artifacts expire first + metadata: + $merge: + - owner: "${ownerEmail}" + source: "${repoUrl}/raw/${head_sha}/.taskcluster.yml" + - $switch: + 'tasks_for == "github-push" || isPullRequest': + name: "Decision Task" + description: 'The task that creates all of the other tasks in the task graph' + 'tasks_for == "action"': + name: "Action: ${action.title}" + description: | + ${action.description} - Action triggered by clientID `${clientId}` - $default: - name: "Decision Task for cron job ${cron.job_name}" - description: 'Created by a [cron task](https://firefox-ci-tc.services.mozilla.com/tasks/${cron.task_id})' + Action triggered by clientID `${clientId}` + $default: + name: "Decision Task for cron job ${cron.job_name}" + description: 'Created by a [cron task](https://firefox-ci-tc.services.mozilla.com/tasks/${cron.task_id})' - provisionerId: "${trustDomain}-${level}" - workerType: "decision-gcp" + provisionerId: "${trustDomain}-${level}" + workerType: "decision-gcp" - tags: - $switch: - 'tasks_for == "github-push" || isPullRequest': - createdForUser: "${ownerEmail}" - kind: decision-task - 'tasks_for == "action"': - createdForUser: '${ownerEmail}' - kind: 'action-callback' - 'tasks_for == "cron"': - kind: cron-task + tags: + $switch: + 'tasks_for == "github-push" || isPullRequest': + createdForUser: "${ownerEmail}" + kind: decision-task + 'tasks_for == "action"': + createdForUser: '${ownerEmail}' + kind: 'action-callback' + 'tasks_for == "cron"': + kind: cron-task - routes: - $flatten: - - checks - - tc-treeherder.v2.${project}.${head_sha} - - $switch: - 'tasks_for == "github-push"': - - "index.${trustDomain}.v2.${project}.latest.taskgraph.decision" - - "index.${trustDomain}.v2.${project}.revision.${head_sha}.taskgraph.decision" - 'tasks_for == "action"': - - "index.${trustDomain}.v2.${project}.revision.${head_sha}.taskgraph.actions.${ownTaskId}" - 'tasks_for 
== "cron"': - - "index.${trustDomain}.v2.${project}.latest.taskgraph.decision-${cron.job_name}" - - "index.${trustDomain}.v2.${project}.revision.${head_sha}.taskgraph.decision-${cron.job_name}" - # list each cron task on this revision, so actions can find them - - 'index.${trustDomain}.v2.${project}.revision.${head_sha}.cron.${ownTaskId}' - $default: [] + routes: + $flatten: + - checks + - tc-treeherder.v2.${project}.${head_sha} + - $switch: + 'tasks_for == "github-push"': + - "index.${trustDomain}.v2.${project}.latest.taskgraph.decision" + - "index.${trustDomain}.v2.${project}.revision.${head_sha}.taskgraph.decision" + 'tasks_for == "action"': + - "index.${trustDomain}.v2.${project}.revision.${head_sha}.taskgraph.actions.${ownTaskId}" + 'tasks_for == "cron"': + - "index.${trustDomain}.v2.${project}.latest.taskgraph.decision-${cron.job_name}" + - "index.${trustDomain}.v2.${project}.revision.${head_sha}.taskgraph.decision-${cron.job_name}" + # list each cron task on this revision, so actions can find them + - 'index.${trustDomain}.v2.${project}.revision.${head_sha}.cron.${ownTaskId}' + $default: [] - scopes: - $switch: - 'tasks_for in ["github-push"]': - $let: - short_head_ref: - $if: 'head_ref[:10] == "refs/tags/"' - then: {$eval: 'head_ref[10:]'} - else: - $if: 'head_ref[:11] == "refs/heads/"' - then: {$eval: 'head_ref[11:]'} - else: ${head_ref} - in: - - 'assume:repo:${repoUrl[8:]}:branch:${short_head_ref}' - 'isPullRequest': - - 'assume:repo:github.com/${event.pull_request.base.repo.full_name}:${tasks_for[7:]}' - 'tasks_for == "action"': - - 'assume:repo:${repoUrl[8:]}:action:${action.action_perm}' - $default: - - 'assume:repo:${repoUrl[8:]}:cron:${cron.job_name}' + scopes: + $switch: + 'tasks_for in ["github-push"]': + $let: + short_head_ref: + $if: 'head_ref[:10] == "refs/tags/"' + then: {$eval: 'head_ref[10:]'} + else: + $if: 'head_ref[:11] == "refs/heads/"' + then: {$eval: 'head_ref[11:]'} + else: ${head_ref} + in: + - 
'assume:repo:${repoUrl[8:]}:branch:${short_head_ref}' + 'isPullRequest': + - 'assume:repo:github.com/${event.pull_request.base.repo.full_name}:${tasks_for[7:]}' + 'tasks_for == "action"': + - 'assume:repo:${repoUrl[8:]}:action:${action.action_perm}' + $default: + - 'assume:repo:${repoUrl[8:]}:cron:${cron.job_name}' - dependencies: [] - requires: all-completed + dependencies: [] + requires: all-completed - priority: - $switch: - 'tasks_for == "cron"': low - 'tasks_for == "github-push"|| isPullRequest': very-low - $default: lowest # tasks_for == 'action' - retries: 5 + priority: + $switch: + 'tasks_for == "cron"': low + 'tasks_for == "github-push"|| isPullRequest': very-low + $default: lowest # tasks_for == 'action' + retries: 5 - payload: - $let: - normProject: - $eval: 'join(split(project, "-"), "_")' - normProjectUpper: - $eval: 'uppercase(join(split(project, "-"), "_"))' - in: - env: - # run-task uses these to check out the source; the inputs to - # `taskgraph decision` are all on the command line. 
- $merge: - - ${normProjectUpper}_BASE_REPOSITORY: '${baseRepoUrl}' - ${normProjectUpper}_BASE_REF: '${base_ref}' - ${normProjectUpper}_BASE_REV: '${base_sha}' - ${normProjectUpper}_HEAD_REPOSITORY: '${repoUrl}' - ${normProjectUpper}_HEAD_REF: '${head_ref}' - ${normProjectUpper}_HEAD_REV: '${head_sha}' - ${normProjectUpper}_REPOSITORY_TYPE: git - ${normProjectUpper}_PIP_REQUIREMENTS: taskcluster/requirements.txt - PIP_DISABLE_REQUIRE_HASHES: "1" - REPOSITORIES: - $json: - ${normProject}: ${normProject} - - $if: 'isPullRequest' - then: - ${normProjectUpper}_PULL_REQUEST_NUMBER: '${event.pull_request.number}' - - $if: 'tasks_for == "action"' - then: - ACTION_TASK_GROUP_ID: '${action.taskGroupId}' # taskGroupId of the target task - ACTION_TASK_ID: {$json: {$eval: 'taskId'}} # taskId of the target task (JSON-encoded) - ACTION_INPUT: {$json: {$eval: 'input'}} - ACTION_CALLBACK: '${action.cb_name}' + payload: + $let: + normProject: + $eval: 'join(split(project, "-"), "_")' + normProjectUpper: + $eval: 'uppercase(join(split(project, "-"), "_"))' + in: + env: + # run-task uses these to check out the source; the inputs to + # `taskgraph decision` are all on the command line. 
+ $merge: + - ${normProjectUpper}_BASE_REPOSITORY: '${baseRepoUrl}' + ${normProjectUpper}_BASE_REF: '${base_ref}' + ${normProjectUpper}_BASE_REV: '${base_sha}' + ${normProjectUpper}_HEAD_REPOSITORY: '${repoUrl}' + ${normProjectUpper}_HEAD_REF: '${head_ref}' + ${normProjectUpper}_HEAD_REV: '${head_sha}' + ${normProjectUpper}_REPOSITORY_TYPE: git + ${normProjectUpper}_PIP_REQUIREMENTS: taskcluster/requirements.txt + PIP_DISABLE_REQUIRE_HASHES: "1" + REPOSITORIES: + $json: + ${normProject}: ${normProject} + - $if: 'isPullRequest' + then: + ${normProjectUpper}_PULL_REQUEST_NUMBER: '${event.pull_request.number}' + - $if: 'tasks_for == "action"' + then: + ACTION_TASK_GROUP_ID: '${action.taskGroupId}' # taskGroupId of the target task + ACTION_TASK_ID: {$json: {$eval: 'taskId'}} # taskId of the target task (JSON-encoded) + ACTION_INPUT: {$json: {$eval: 'input'}} + ACTION_CALLBACK: '${action.cb_name}' - cache: - "${trustDomain}-level-${level}-checkouts-sparse-v2": /builds/worker/checkouts + cache: + "${trustDomain}-level-${level}-checkouts-sparse-v2": /builds/worker/checkouts - features: - taskclusterProxy: true + features: + taskclusterProxy: true - image: mozillareleases/taskgraph:decision-5483484ad45a3d27a0f5bd05f1c87d90e08df67a3713605d812b851a8a5bd854@sha256:ef132cc5741539f846a85bbe0cebc3c9ead30b8f24c1da46c55363f2170c3993 - maxRunTime: 1800 + image: mozillareleases/taskgraph:decision-5483484ad45a3d27a0f5bd05f1c87d90e08df67a3713605d812b851a8a5bd854@sha256:ef132cc5741539f846a85bbe0cebc3c9ead30b8f24c1da46c55363f2170c3993 + maxRunTime: 1800 - command: - - run-task - - '--${normProject}-checkout=/builds/worker/checkouts/src' - - '--' - - bash - - -cx - - $let: - extraArgs: {$if: 'tasks_for == "cron"', then: '${cron.quoted_args}', else: ''} - in: - $if: 'tasks_for == "action"' - then: > - cd /builds/worker/checkouts/src && - ln -s /builds/worker/artifacts artifacts && - ~/.local/bin/taskgraph action-callback - else: > - cd /builds/worker/checkouts/src && - ln -s 
/builds/worker/artifacts artifacts && - ~/.local/bin/taskgraph decision - --pushlog-id='0' - --pushdate='0' - --project='${project}' - --owner='${ownerEmail}' - --level='${level}' - --repository-type=git - --tasks-for='${tasks_for}' - --base-repository='${baseRepoUrl}' - --base-ref='${base_ref}' - --base-rev='${base_sha}' - --head-repository='${repoUrl}' - --head-ref='${head_ref}' - --head-rev='${head_sha}' - ${extraArgs} + command: + - run-task + - '--${normProject}-checkout=/builds/worker/checkouts/src' + - '--' + - bash + - -cx + - $let: + extraArgs: {$if: 'tasks_for == "cron"', then: '${cron.quoted_args}', else: ''} + in: + $if: 'tasks_for == "action"' + then: > + cd /builds/worker/checkouts/src && + ln -s /builds/worker/artifacts artifacts && + ~/.local/bin/taskgraph action-callback + else: > + cd /builds/worker/checkouts/src && + ln -s /builds/worker/artifacts artifacts && + ~/.local/bin/taskgraph decision + --pushlog-id='0' + --pushdate='0' + --project='${project}' + --owner='${ownerEmail}' + --level='${level}' + --repository-type=git + --tasks-for='${tasks_for}' + --base-repository='${baseRepoUrl}' + --base-ref='${base_ref}' + --base-rev='${base_sha}' + --head-repository='${repoUrl}' + --head-ref='${head_ref}' + --head-rev='${head_sha}' + ${extraArgs} - artifacts: - 'public': - type: 'directory' - path: '/builds/worker/artifacts' - expires: {$fromNow: '1 year'} - 'public/docker-contexts': - type: 'directory' - path: '/builds/worker/checkouts/src/docker-contexts' - # This needs to be at least the deadline of the - # decision task + the docker-image task deadlines. - # It is set to a week to allow for some time for - # debugging, but they are not useful long-term. 
- expires: {$fromNow: '7 day'} + artifacts: + 'public': + type: 'directory' + path: '/builds/worker/artifacts' + expires: {$fromNow: '1 year'} + 'public/docker-contexts': + type: 'directory' + path: '/builds/worker/checkouts/src/docker-contexts' + # This needs to be at least the deadline of the + # decision task + the docker-image task deadlines. + # It is set to a week to allow for some time for + # debugging, but they are not useful long-term. + expires: {$fromNow: '7 day'} - extra: - $merge: - - treeherder: - $merge: - - machine: - platform: gecko-decision - - $if: 'tasks_for == "github-push" || isPullRequest' - then: - symbol: D - else: - $if: 'tasks_for == "action"' - then: - groupName: 'action-callback' - groupSymbol: AC - symbol: "${action.symbol}" - else: - groupSymbol: cron - symbol: "${cron.job_symbol}" - - $if: 'tasks_for == "action"' - then: - parent: '${action.taskGroupId}' - action: - name: '${action.name}' - context: - taskGroupId: '${action.taskGroupId}' - taskId: {$eval: 'taskId'} - input: {$eval: 'input'} - clientId: {$eval: 'clientId'} - - $if: 'tasks_for == "cron"' - then: - cron: {$json: {$eval: 'cron'}} - - tasks_for: '${tasks_for}' + extra: + $merge: + - treeherder: + $merge: + - machine: + platform: gecko-decision + - $if: 'tasks_for == "github-push" || isPullRequest' + then: + symbol: D + else: + $if: 'tasks_for == "action"' + then: + groupName: 'action-callback' + groupSymbol: AC + symbol: "${action.symbol}" + else: + groupSymbol: cron + symbol: "${cron.job_symbol}" + - $if: 'tasks_for == "action"' + then: + parent: '${action.taskGroupId}' + action: + name: '${action.name}' + context: + taskGroupId: '${action.taskGroupId}' + taskId: {$eval: 'taskId'} + input: {$eval: 'input'} + clientId: {$eval: 'clientId'} + - $if: 'tasks_for == "cron"' + then: + cron: {$json: {$eval: 'cron'}} + - tasks_for: '${tasks_for}' From 49b2733551cc15f3dc56bd70b26b540de65f3f12 Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Mon, 1 May 2023 15:01:04 -0400 Subject: 
[PATCH 17/24] Re-adjust ci-config.yml for production repository --- taskcluster/ci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taskcluster/ci/config.yml b/taskcluster/ci/config.yml index 08bb36e8a..c193efc0d 100644 --- a/taskcluster/ci/config.yml +++ b/taskcluster/ci/config.yml @@ -14,11 +14,11 @@ treeherder: taskgraph: register: translations_taskgraph:register - cached-task-prefix: "translations.v2.staging-firefox-translations-training" + cached-task-prefix: "translations.v2.firefox-translations-training" decision-parameters: "translations_taskgraph.parameters:get_decision_parameters" repositories: firefox_translations_training: - name: "staging-firefox-translations-training" + name: "firefox-translations-training" # It's not exactly _ideal_ to have all of the locale pairs for each dataset # specified in this file, but it's very difficult (if not impossible) to From c87d3dcde420052693a923acca5c457cf88ca07a Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Mon, 1 May 2023 15:09:13 -0400 Subject: [PATCH 18/24] Don't set treeherder routes for pull requests --- .taskcluster.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.taskcluster.yml b/.taskcluster.yml index 7d3e1fd3c..168719783 100644 --- a/.taskcluster.yml +++ b/.taskcluster.yml @@ -126,7 +126,9 @@ tasks: routes: $flatten: - checks - - tc-treeherder.v2.${project}.${head_sha} + - $if: 'level == "3"' + then: + - tc-treeherder.v2.${project}.${head_sha} - $switch: 'tasks_for == "github-push"': - "index.${trustDomain}.v2.${project}.latest.taskgraph.decision" From 06a0ae5079f8087f44e0bb21cd8ceebb8943ba58 Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Mon, 1 May 2023 17:26:54 -0400 Subject: [PATCH 19/24] Use standard cache prefixes --- taskcluster/ci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/taskcluster/ci/config.yml b/taskcluster/ci/config.yml index c193efc0d..5e26808e7 100644 --- a/taskcluster/ci/config.yml +++ 
b/taskcluster/ci/config.yml @@ -14,7 +14,6 @@ treeherder: taskgraph: register: translations_taskgraph:register - cached-task-prefix: "translations.v2.firefox-translations-training" decision-parameters: "translations_taskgraph.parameters:get_decision_parameters" repositories: firefox_translations_training: From 3a7950a249789aa0f06529c3293b4297a4652aed Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Mon, 1 May 2023 19:03:34 -0400 Subject: [PATCH 20/24] Add CODEOWNERS file to suggest RelEng as a reviewer for taskcluster changes --- .github/CODEOWNERS | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .github/CODEOWNERS diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 000000000..26f6f4418 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,5 @@ +# Taskcluster pipeline related files. Changes to these ought to be reviewed by +# RelEng to watch for security issues and best practices. These should also +# be reviewed by people familiar with the pipeline itself. 
+.taskcluster.yml @mozilla/releng +taskcluster @mozilla/releng From d38e24751c520c7761edc0303c78482fa782d7da Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Tue, 2 May 2023 19:26:29 -0400 Subject: [PATCH 21/24] Bump taskgraph version; re-enable pip hash checking --- .taskcluster.yml | 1 - taskcluster/requirements.in | 2 +- taskcluster/requirements.txt | 293 +++++++++++++++++++++++++++++++---- 3 files changed, 263 insertions(+), 33 deletions(-) diff --git a/.taskcluster.yml b/.taskcluster.yml index 168719783..94fb279bd 100644 --- a/.taskcluster.yml +++ b/.taskcluster.yml @@ -191,7 +191,6 @@ tasks: ${normProjectUpper}_HEAD_REV: '${head_sha}' ${normProjectUpper}_REPOSITORY_TYPE: git ${normProjectUpper}_PIP_REQUIREMENTS: taskcluster/requirements.txt - PIP_DISABLE_REQUIRE_HASHES: "1" REPOSITORIES: $json: ${normProject}: ${normProject} diff --git a/taskcluster/requirements.in b/taskcluster/requirements.in index 94b80a7cb..9e939e7e6 100644 --- a/taskcluster/requirements.in +++ b/taskcluster/requirements.in @@ -1 +1 @@ -git+https://github.com/bhearsum/taskgraph@fetch-multiple-artifacts +taskcluster-taskgraph>=5.1.0 diff --git a/taskcluster/requirements.txt b/taskcluster/requirements.txt index 1ad679ac4..5b427b272 100644 --- a/taskcluster/requirements.txt +++ b/taskcluster/requirements.txt @@ -1,71 +1,302 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # -# pip-compile requirements.in +# pip-compile --generate-hashes requirements.in # -appdirs==1.4.4 +appdirs==1.4.4 \ + --hash=sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41 \ + --hash=sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128 # via taskcluster-taskgraph -arrow==1.2.3 +arrow==1.2.3 \ + --hash=sha256:3934b30ca1b9f292376d9db15b19446088d12ec58629bc3f0da28fd55fb633a1 \ + --hash=sha256:5a49ab92e3b7b71d96cd6bfcc4df14efefc9dfa96ea19045815914a6ab6b1fe2 # via jinja2-time 
-attrs==22.2.0 +attrs==22.2.0 \ + --hash=sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836 \ + --hash=sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99 # via taskcluster-taskgraph -binaryornot==0.4.4 +binaryornot==0.4.4 \ + --hash=sha256:359501dfc9d40632edc9fac890e19542db1a287bbcfa58175b66658392018061 \ + --hash=sha256:b8b71173c917bddcd2c16070412e369c3ed7f0528926f70cac18a6c97fd563e4 # via cookiecutter -certifi==2022.12.7 +certifi==2022.12.7 \ + --hash=sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3 \ + --hash=sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18 # via requests -chardet==5.1.0 +chardet==5.1.0 \ + --hash=sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5 \ + --hash=sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9 # via binaryornot -charset-normalizer==3.0.1 +charset-normalizer==3.0.1 \ + --hash=sha256:00d3ffdaafe92a5dc603cb9bd5111aaa36dfa187c8285c543be562e61b755f6b \ + --hash=sha256:024e606be3ed92216e2b6952ed859d86b4cfa52cd5bc5f050e7dc28f9b43ec42 \ + --hash=sha256:0298eafff88c99982a4cf66ba2efa1128e4ddaca0b05eec4c456bbc7db691d8d \ + --hash=sha256:02a51034802cbf38db3f89c66fb5d2ec57e6fe7ef2f4a44d070a593c3688667b \ + --hash=sha256:083c8d17153ecb403e5e1eb76a7ef4babfc2c48d58899c98fcaa04833e7a2f9a \ + --hash=sha256:0a11e971ed097d24c534c037d298ad32c6ce81a45736d31e0ff0ad37ab437d59 \ + --hash=sha256:0bf2dae5291758b6f84cf923bfaa285632816007db0330002fa1de38bfcb7154 \ + --hash=sha256:0c0a590235ccd933d9892c627dec5bc7511ce6ad6c1011fdf5b11363022746c1 \ + --hash=sha256:0f438ae3532723fb6ead77e7c604be7c8374094ef4ee2c5e03a3a17f1fca256c \ + --hash=sha256:109487860ef6a328f3eec66f2bf78b0b72400280d8f8ea05f69c51644ba6521a \ + --hash=sha256:11b53acf2411c3b09e6af37e4b9005cba376c872503c8f28218c7243582df45d \ + --hash=sha256:12db3b2c533c23ab812c2b25934f60383361f8a376ae272665f8e48b88e8e1c6 \ + 
--hash=sha256:14e76c0f23218b8f46c4d87018ca2e441535aed3632ca134b10239dfb6dadd6b \ + --hash=sha256:16a8663d6e281208d78806dbe14ee9903715361cf81f6d4309944e4d1e59ac5b \ + --hash=sha256:292d5e8ba896bbfd6334b096e34bffb56161c81408d6d036a7dfa6929cff8783 \ + --hash=sha256:2c03cc56021a4bd59be889c2b9257dae13bf55041a3372d3295416f86b295fb5 \ + --hash=sha256:2e396d70bc4ef5325b72b593a72c8979999aa52fb8bcf03f701c1b03e1166918 \ + --hash=sha256:2edb64ee7bf1ed524a1da60cdcd2e1f6e2b4f66ef7c077680739f1641f62f555 \ + --hash=sha256:31a9ddf4718d10ae04d9b18801bd776693487cbb57d74cc3458a7673f6f34639 \ + --hash=sha256:356541bf4381fa35856dafa6a965916e54bed415ad8a24ee6de6e37deccf2786 \ + --hash=sha256:358a7c4cb8ba9b46c453b1dd8d9e431452d5249072e4f56cfda3149f6ab1405e \ + --hash=sha256:37f8febc8ec50c14f3ec9637505f28e58d4f66752207ea177c1d67df25da5aed \ + --hash=sha256:39049da0ffb96c8cbb65cbf5c5f3ca3168990adf3551bd1dee10c48fce8ae820 \ + --hash=sha256:39cf9ed17fe3b1bc81f33c9ceb6ce67683ee7526e65fde1447c772afc54a1bb8 \ + --hash=sha256:3ae1de54a77dc0d6d5fcf623290af4266412a7c4be0b1ff7444394f03f5c54e3 \ + --hash=sha256:3b590df687e3c5ee0deef9fc8c547d81986d9a1b56073d82de008744452d6541 \ + --hash=sha256:3e45867f1f2ab0711d60c6c71746ac53537f1684baa699f4f668d4c6f6ce8e14 \ + --hash=sha256:3fc1c4a2ffd64890aebdb3f97e1278b0cc72579a08ca4de8cd2c04799a3a22be \ + --hash=sha256:4457ea6774b5611f4bed5eaa5df55f70abde42364d498c5134b7ef4c6958e20e \ + --hash=sha256:44ba614de5361b3e5278e1241fda3dc1838deed864b50a10d7ce92983797fa76 \ + --hash=sha256:4a8fcf28c05c1f6d7e177a9a46a1c52798bfe2ad80681d275b10dcf317deaf0b \ + --hash=sha256:4b0d02d7102dd0f997580b51edc4cebcf2ab6397a7edf89f1c73b586c614272c \ + --hash=sha256:502218f52498a36d6bf5ea77081844017bf7982cdbe521ad85e64cabee1b608b \ + --hash=sha256:503e65837c71b875ecdd733877d852adbc465bd82c768a067badd953bf1bc5a3 \ + --hash=sha256:5995f0164fa7df59db4746112fec3f49c461dd6b31b841873443bdb077c13cfc \ + --hash=sha256:59e5686dd847347e55dffcc191a96622f016bc0ad89105e24c14e0d6305acbc6 \ + 
--hash=sha256:601f36512f9e28f029d9481bdaf8e89e5148ac5d89cffd3b05cd533eeb423b59 \ + --hash=sha256:608862a7bf6957f2333fc54ab4399e405baad0163dc9f8d99cb236816db169d4 \ + --hash=sha256:62595ab75873d50d57323a91dd03e6966eb79c41fa834b7a1661ed043b2d404d \ + --hash=sha256:70990b9c51340e4044cfc394a81f614f3f90d41397104d226f21e66de668730d \ + --hash=sha256:71140351489970dfe5e60fc621ada3e0f41104a5eddaca47a7acb3c1b851d6d3 \ + --hash=sha256:72966d1b297c741541ca8cf1223ff262a6febe52481af742036a0b296e35fa5a \ + --hash=sha256:74292fc76c905c0ef095fe11e188a32ebd03bc38f3f3e9bcb85e4e6db177b7ea \ + --hash=sha256:761e8904c07ad053d285670f36dd94e1b6ab7f16ce62b9805c475b7aa1cffde6 \ + --hash=sha256:772b87914ff1152b92a197ef4ea40efe27a378606c39446ded52c8f80f79702e \ + --hash=sha256:79909e27e8e4fcc9db4addea88aa63f6423ebb171db091fb4373e3312cb6d603 \ + --hash=sha256:7e189e2e1d3ed2f4aebabd2d5b0f931e883676e51c7624826e0a4e5fe8a0bf24 \ + --hash=sha256:7eb33a30d75562222b64f569c642ff3dc6689e09adda43a082208397f016c39a \ + --hash=sha256:81d6741ab457d14fdedc215516665050f3822d3e56508921cc7239f8c8e66a58 \ + --hash=sha256:8499ca8f4502af841f68135133d8258f7b32a53a1d594aa98cc52013fff55678 \ + --hash=sha256:84c3990934bae40ea69a82034912ffe5a62c60bbf6ec5bc9691419641d7d5c9a \ + --hash=sha256:87701167f2a5c930b403e9756fab1d31d4d4da52856143b609e30a1ce7160f3c \ + --hash=sha256:88600c72ef7587fe1708fd242b385b6ed4b8904976d5da0893e31df8b3480cb6 \ + --hash=sha256:8ac7b6a045b814cf0c47f3623d21ebd88b3e8cf216a14790b455ea7ff0135d18 \ + --hash=sha256:8b8af03d2e37866d023ad0ddea594edefc31e827fee64f8de5611a1dbc373174 \ + --hash=sha256:8c7fe7afa480e3e82eed58e0ca89f751cd14d767638e2550c77a92a9e749c317 \ + --hash=sha256:8eade758719add78ec36dc13201483f8e9b5d940329285edcd5f70c0a9edbd7f \ + --hash=sha256:911d8a40b2bef5b8bbae2e36a0b103f142ac53557ab421dc16ac4aafee6f53dc \ + --hash=sha256:93ad6d87ac18e2a90b0fe89df7c65263b9a99a0eb98f0a3d2e079f12a0735837 \ + --hash=sha256:95dea361dd73757c6f1c0a1480ac499952c16ac83f7f5f4f84f0658a01b8ef41 \ + 
--hash=sha256:9ab77acb98eba3fd2a85cd160851816bfce6871d944d885febf012713f06659c \ + --hash=sha256:9cb3032517f1627cc012dbc80a8ec976ae76d93ea2b5feaa9d2a5b8882597579 \ + --hash=sha256:9cf4e8ad252f7c38dd1f676b46514f92dc0ebeb0db5552f5f403509705e24753 \ + --hash=sha256:9d9153257a3f70d5f69edf2325357251ed20f772b12e593f3b3377b5f78e7ef8 \ + --hash=sha256:a152f5f33d64a6be73f1d30c9cc82dfc73cec6477ec268e7c6e4c7d23c2d2291 \ + --hash=sha256:a16418ecf1329f71df119e8a65f3aa68004a3f9383821edcb20f0702934d8087 \ + --hash=sha256:a60332922359f920193b1d4826953c507a877b523b2395ad7bc716ddd386d866 \ + --hash=sha256:a8d0fc946c784ff7f7c3742310cc8a57c5c6dc31631269876a88b809dbeff3d3 \ + --hash=sha256:ab5de034a886f616a5668aa5d098af2b5385ed70142090e2a31bcbd0af0fdb3d \ + --hash=sha256:c22d3fe05ce11d3671297dc8973267daa0f938b93ec716e12e0f6dee81591dc1 \ + --hash=sha256:c2ac1b08635a8cd4e0cbeaf6f5e922085908d48eb05d44c5ae9eabab148512ca \ + --hash=sha256:c512accbd6ff0270939b9ac214b84fb5ada5f0409c44298361b2f5e13f9aed9e \ + --hash=sha256:c75ffc45f25324e68ab238cb4b5c0a38cd1c3d7f1fb1f72b5541de469e2247db \ + --hash=sha256:c95a03c79bbe30eec3ec2b7f076074f4281526724c8685a42872974ef4d36b72 \ + --hash=sha256:cadaeaba78750d58d3cc6ac4d1fd867da6fc73c88156b7a3212a3cd4819d679d \ + --hash=sha256:cd6056167405314a4dc3c173943f11249fa0f1b204f8b51ed4bde1a9cd1834dc \ + --hash=sha256:db72b07027db150f468fbada4d85b3b2729a3db39178abf5c543b784c1254539 \ + --hash=sha256:df2c707231459e8a4028eabcd3cfc827befd635b3ef72eada84ab13b52e1574d \ + --hash=sha256:e62164b50f84e20601c1ff8eb55620d2ad25fb81b59e3cd776a1902527a788af \ + --hash=sha256:e696f0dd336161fca9adbb846875d40752e6eba585843c768935ba5c9960722b \ + --hash=sha256:eaa379fcd227ca235d04152ca6704c7cb55564116f8bc52545ff357628e10602 \ + --hash=sha256:ebea339af930f8ca5d7a699b921106c6e29c617fe9606fa7baa043c1cdae326f \ + --hash=sha256:f4c39b0e3eac288fedc2b43055cfc2ca7a60362d0e5e87a637beac5d801ef478 \ + --hash=sha256:f5057856d21e7586765171eac8b9fc3f7d44ef39425f85dbcccb13b3ebea806c \ + 
--hash=sha256:f6f45710b4459401609ebebdbcfb34515da4fc2aa886f95107f556ac69a9147e \ + --hash=sha256:f97e83fa6c25693c7a35de154681fcc257c1c41b38beb0304b9c4d2d9e164479 \ + --hash=sha256:f9d0c5c045a3ca9bedfc35dca8526798eb91a07aa7a2c0fee134c6c6f321cbd7 \ + --hash=sha256:ff6f3db31555657f3163b15a6b7c6938d08df7adbfc9dd13d9d19edad678f1e8 # via requests -click==8.1.3 +click==8.1.3 \ + --hash=sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e \ + --hash=sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48 # via cookiecutter -cookiecutter==2.1.1 +cookiecutter==2.1.1 \ + --hash=sha256:9f3ab027cec4f70916e28f03470bdb41e637a3ad354b4d65c765d93aad160022 \ + --hash=sha256:f3982be8d9c53dac1261864013fdec7f83afd2e42ede6f6dd069c5e149c540d5 # via taskcluster-taskgraph -giturlparse==0.10.0 +giturlparse==0.10.0 \ + --hash=sha256:04ba1a3a099c3093fa8d24a422913c6a9b2c2cd22bcffc939cf72e3e98f672d7 \ + --hash=sha256:2595ab291d30717cda8474b874c9fd509f1b9802ad7f6968c36a45e4b13eb337 # via mozilla-repo-urls -idna==3.4 +idna==3.4 \ + --hash=sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4 \ + --hash=sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2 # via requests -jinja2==3.1.2 +jinja2==3.1.2 \ + --hash=sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852 \ + --hash=sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61 # via # cookiecutter # jinja2-time -jinja2-time==0.2.0 +jinja2-time==0.2.0 \ + --hash=sha256:d14eaa4d315e7688daa4969f616f226614350c48730bfa1692d2caebd8c90d40 \ + --hash=sha256:d3eab6605e3ec8b7a0863df09cc1d23714908fa61aa6986a845c20ba488b4efa # via cookiecutter -json-e==4.5.0 +json-e==4.5.0 \ + --hash=sha256:618a94aecc8b8bc7733d6cd0ee7b676e45675566625a38958aa8b30379d9758f \ + --hash=sha256:e733ce77b4acbbc2c48211057f8cb5af45999e6be4ce0f07585c5580df45826e # via taskcluster-taskgraph -markupsafe==2.1.2 +markupsafe==2.1.2 \ + 
--hash=sha256:0576fe974b40a400449768941d5d0858cc624e3249dfd1e0c33674e5c7ca7aed \ + --hash=sha256:085fd3201e7b12809f9e6e9bc1e5c96a368c8523fad5afb02afe3c051ae4afcc \ + --hash=sha256:090376d812fb6ac5f171e5938e82e7f2d7adc2b629101cec0db8b267815c85e2 \ + --hash=sha256:0b462104ba25f1ac006fdab8b6a01ebbfbce9ed37fd37fd4acd70c67c973e460 \ + --hash=sha256:137678c63c977754abe9086a3ec011e8fd985ab90631145dfb9294ad09c102a7 \ + --hash=sha256:1bea30e9bf331f3fef67e0a3877b2288593c98a21ccb2cf29b74c581a4eb3af0 \ + --hash=sha256:22152d00bf4a9c7c83960521fc558f55a1adbc0631fbb00a9471e097b19d72e1 \ + --hash=sha256:22731d79ed2eb25059ae3df1dfc9cb1546691cc41f4e3130fe6bfbc3ecbbecfa \ + --hash=sha256:2298c859cfc5463f1b64bd55cb3e602528db6fa0f3cfd568d3605c50678f8f03 \ + --hash=sha256:28057e985dace2f478e042eaa15606c7efccb700797660629da387eb289b9323 \ + --hash=sha256:2e7821bffe00aa6bd07a23913b7f4e01328c3d5cc0b40b36c0bd81d362faeb65 \ + --hash=sha256:2ec4f2d48ae59bbb9d1f9d7efb9236ab81429a764dedca114f5fdabbc3788013 \ + --hash=sha256:340bea174e9761308703ae988e982005aedf427de816d1afe98147668cc03036 \ + --hash=sha256:40627dcf047dadb22cd25ea7ecfe9cbf3bbbad0482ee5920b582f3809c97654f \ + --hash=sha256:40dfd3fefbef579ee058f139733ac336312663c6706d1163b82b3003fb1925c4 \ + --hash=sha256:4cf06cdc1dda95223e9d2d3c58d3b178aa5dacb35ee7e3bbac10e4e1faacb419 \ + --hash=sha256:50c42830a633fa0cf9e7d27664637532791bfc31c731a87b202d2d8ac40c3ea2 \ + --hash=sha256:55f44b440d491028addb3b88f72207d71eeebfb7b5dbf0643f7c023ae1fba619 \ + --hash=sha256:608e7073dfa9e38a85d38474c082d4281f4ce276ac0010224eaba11e929dd53a \ + --hash=sha256:63ba06c9941e46fa389d389644e2d8225e0e3e5ebcc4ff1ea8506dce646f8c8a \ + --hash=sha256:65608c35bfb8a76763f37036547f7adfd09270fbdbf96608be2bead319728fcd \ + --hash=sha256:665a36ae6f8f20a4676b53224e33d456a6f5a72657d9c83c2aa00765072f31f7 \ + --hash=sha256:6d6607f98fcf17e534162f0709aaad3ab7a96032723d8ac8750ffe17ae5a0666 \ + --hash=sha256:7313ce6a199651c4ed9d7e4cfb4aa56fe923b1adf9af3b420ee14e6d9a73df65 \ + 
--hash=sha256:7668b52e102d0ed87cb082380a7e2e1e78737ddecdde129acadb0eccc5423859 \ + --hash=sha256:7df70907e00c970c60b9ef2938d894a9381f38e6b9db73c5be35e59d92e06625 \ + --hash=sha256:7e007132af78ea9df29495dbf7b5824cb71648d7133cf7848a2a5dd00d36f9ff \ + --hash=sha256:835fb5e38fd89328e9c81067fd642b3593c33e1e17e2fdbf77f5676abb14a156 \ + --hash=sha256:8bca7e26c1dd751236cfb0c6c72d4ad61d986e9a41bbf76cb445f69488b2a2bd \ + --hash=sha256:8db032bf0ce9022a8e41a22598eefc802314e81b879ae093f36ce9ddf39ab1ba \ + --hash=sha256:99625a92da8229df6d44335e6fcc558a5037dd0a760e11d84be2260e6f37002f \ + --hash=sha256:9cad97ab29dfc3f0249b483412c85c8ef4766d96cdf9dcf5a1e3caa3f3661cf1 \ + --hash=sha256:a4abaec6ca3ad8660690236d11bfe28dfd707778e2442b45addd2f086d6ef094 \ + --hash=sha256:a6e40afa7f45939ca356f348c8e23048e02cb109ced1eb8420961b2f40fb373a \ + --hash=sha256:a6f2fcca746e8d5910e18782f976489939d54a91f9411c32051b4aab2bd7c513 \ + --hash=sha256:a806db027852538d2ad7555b203300173dd1b77ba116de92da9afbc3a3be3eed \ + --hash=sha256:abcabc8c2b26036d62d4c746381a6f7cf60aafcc653198ad678306986b09450d \ + --hash=sha256:b8526c6d437855442cdd3d87eede9c425c4445ea011ca38d937db299382e6fa3 \ + --hash=sha256:bb06feb762bade6bf3c8b844462274db0c76acc95c52abe8dbed28ae3d44a147 \ + --hash=sha256:c0a33bc9f02c2b17c3ea382f91b4db0e6cde90b63b296422a939886a7a80de1c \ + --hash=sha256:c4a549890a45f57f1ebf99c067a4ad0cb423a05544accaf2b065246827ed9603 \ + --hash=sha256:ca244fa73f50a800cf8c3ebf7fd93149ec37f5cb9596aa8873ae2c1d23498601 \ + --hash=sha256:cf877ab4ed6e302ec1d04952ca358b381a882fbd9d1b07cccbfd61783561f98a \ + --hash=sha256:d9d971ec1e79906046aa3ca266de79eac42f1dbf3612a05dc9368125952bd1a1 \ + --hash=sha256:da25303d91526aac3672ee6d49a2f3db2d9502a4a60b55519feb1a4c7714e07d \ + --hash=sha256:e55e40ff0cc8cc5c07996915ad367fa47da6b3fc091fdadca7f5403239c5fec3 \ + --hash=sha256:f03a532d7dee1bed20bc4884194a16160a2de9ffc6354b3878ec9682bb623c54 \ + --hash=sha256:f1cd098434e83e656abf198f103a8207a8187c0fc110306691a2e94a78d0abb2 \ + 
--hash=sha256:f2bfb563d0211ce16b63c7cb9395d2c682a23187f54c3d79bfec33e6705473c6 \ + --hash=sha256:f8ffb705ffcf5ddd0e80b65ddf7bed7ee4f5a441ea7d3419e861a12eaf41af58 # via jinja2 -mozilla-repo-urls==0.1.1 +mozilla-repo-urls==0.1.1 \ + --hash=sha256:30510d3519479aa70211145d0ac9cf6e2fadcb8d30fa3b196bb957bd773502ba \ + --hash=sha256:7364da790751db2a060eb45adbf1d7db89a145ed279ba235f3425db9dd255915 # via taskcluster-taskgraph -python-dateutil==2.8.2 +python-dateutil==2.8.2 \ + --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ + --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 # via arrow -python-slugify==8.0.1 +python-slugify==8.0.1 \ + --hash=sha256:70ca6ea68fe63ecc8fa4fcf00ae651fc8a5d02d93dcd12ae6d4fc7ca46c4d395 \ + --hash=sha256:ce0d46ddb668b3be82f4ed5e503dbc33dd815d83e2eb6824211310d3fb172a27 # via cookiecutter -pyyaml==6.0 +pyyaml==6.0 \ + --hash=sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf \ + --hash=sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293 \ + --hash=sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b \ + --hash=sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57 \ + --hash=sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b \ + --hash=sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4 \ + --hash=sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07 \ + --hash=sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba \ + --hash=sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9 \ + --hash=sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287 \ + --hash=sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513 \ + --hash=sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0 \ + --hash=sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782 \ + 
--hash=sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0 \ + --hash=sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92 \ + --hash=sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f \ + --hash=sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2 \ + --hash=sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc \ + --hash=sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1 \ + --hash=sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c \ + --hash=sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86 \ + --hash=sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4 \ + --hash=sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c \ + --hash=sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34 \ + --hash=sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b \ + --hash=sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d \ + --hash=sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c \ + --hash=sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb \ + --hash=sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7 \ + --hash=sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737 \ + --hash=sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3 \ + --hash=sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d \ + --hash=sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358 \ + --hash=sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53 \ + --hash=sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78 \ + --hash=sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803 \ + --hash=sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a \ + 
--hash=sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f \ + --hash=sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174 \ + --hash=sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5 # via # cookiecutter # taskcluster-taskgraph -redo==2.0.4 +redo==2.0.4 \ + --hash=sha256:81066955041c853b0e6491eb65a0877dce45131c4cfa3d42d923fc2aa8f7a043 \ + --hash=sha256:c76e4c23ab2f8840261736a851323cd98493710e7a9d36a1058535dca501f293 # via taskcluster-taskgraph -requests==2.28.2 +requests==2.28.2 \ + --hash=sha256:64299f4909223da747622c030b781c0d7811e359c37124b4bd368fb8c6518baa \ + --hash=sha256:98b1b2782e3c6c4904938b84c0eb932721069dfdb9134313beff7c83c2df24bf # via # cookiecutter # requests-unixsocket # taskcluster-taskgraph -requests-unixsocket==0.3.0 +requests-unixsocket==0.3.0 \ + --hash=sha256:28304283ea9357d45fff58ad5b11e47708cfbf5806817aa59b2a363228ee971e \ + --hash=sha256:c685c680f0809e1b2955339b1e5afc3c0022b3066f4f7eb343f43a6065fc0e5d # via taskcluster-taskgraph -six==1.16.0 +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 # via python-dateutil -slugid==2.0.0 +slugid==2.0.0 \ + --hash=sha256:a950d98b72691178bdd4d6c52743c4a2aa039207cf7a97d71060a111ff9ba297 \ + --hash=sha256:aec8b0e01c4ad32e38e12d609eab3ec912fd129aaf6b2ded0199b56a5f8fd67c # via taskcluster-taskgraph -taskcluster-taskgraph @ git+https://github.com/bhearsum/taskgraph@fetch-multiple-artifacts +taskcluster-taskgraph==5.1.0 \ + --hash=sha256:12d1fa73c9149400458018d78d1c2a33912f290283c16bfaf9bd356051a11dc7 \ + --hash=sha256:93df9c2a2f94411a88788f79fbcc51576003a8d7795af70b195f123ff3886777 # via -r requirements.in -taskcluster-urls==13.0.1 +taskcluster-urls==13.0.1 \ + --hash=sha256:5e25e7e6818e8877178b175ff43d2e6548afad72694aa125f404a7329ece0973 \ + --hash=sha256:b25e122ecec249c4299ac7b20b08db76e3e2025bdaeb699a9d444556de5fd367 \ 
+ --hash=sha256:f66dcbd6572a6216ab65949f0fa0b91f2df647918028436c384e6af5cd12ae2b # via taskcluster-taskgraph -text-unidecode==1.3 +text-unidecode==1.3 \ + --hash=sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8 \ + --hash=sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93 # via python-slugify -urllib3==1.26.14 +urllib3==1.26.14 \ + --hash=sha256:076907bf8fd355cde77728471316625a4d2f7e713c125f51953bb5b3eecf4f72 \ + --hash=sha256:75edcdc2f7d85b137124a6c3c9fc3933cdeaa12ecb9a6a959f22797a0feca7e1 # via requests -voluptuous==0.13.1 +voluptuous==0.13.1 \ + --hash=sha256:4b838b185f5951f2d6e8752b68fcf18bd7a9c26ded8f143f92d6d28f3921a3e6 \ + --hash=sha256:e8d31c20601d6773cb14d4c0f42aee29c6821bbd1018039aac7ac5605b489723 # via taskcluster-taskgraph From 7d97d5a40c1dcb8856cb0629fcbfbf5f91746b16 Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Wed, 3 May 2023 12:09:04 -0400 Subject: [PATCH 22/24] Switch cache attributes to be nested, instead of multiple top level attributes. 
--- taskcluster/ci/bicleaner/kind.yml | 23 ++++++----- taskcluster/ci/clean/kind.yml | 40 ++++++++++--------- taskcluster/ci/dataset/kind.yml | 28 ++++++++----- .../transforms/cache.py | 7 ++-- 4 files changed, 55 insertions(+), 43 deletions(-) diff --git a/taskcluster/ci/bicleaner/kind.yml b/taskcluster/ci/bicleaner/kind.yml index feff6bd3e..425162dcc 100644 --- a/taskcluster/ci/bicleaner/kind.yml +++ b/taskcluster/ci/bicleaner/kind.yml @@ -23,10 +23,11 @@ kind-dependencies: task-defaults: attributes: - cache-resources: - - pipeline/bicleaner/bicleaner.sh - cache-parameters: - - bicleaner_threshold + cache: + resources: + - pipeline/bicleaner/bicleaner.sh + parameters: + - bicleaner_threshold substitution-fields: - description - name @@ -104,9 +105,10 @@ tasks: treeherder: platform: bicleaner/opt attributes: - cache-type: bicleaner - cache-resources: - - pipeline/bicleaner/requirements/bicleaner.txt + cache: + type: bicleaner + resources: + - pipeline/bicleaner/requirements/bicleaner.txt run: command-context: bicleaner_type: bicleaner @@ -120,9 +122,10 @@ tasks: treeherder: platform: bicleaner-ai/opt attributes: - cache-type: bicleaner-ai - cache-resources: - - pipeline/bicleaner/requirements/bicleaner-ai.txt + cache: + type: bicleaner-ai + resources: + - pipeline/bicleaner/requirements/bicleaner-ai.txt run: command-context: bicleaner_type: bicleaner-ai diff --git a/taskcluster/ci/clean/kind.yml b/taskcluster/ci/clean/kind.yml index 66504a862..6b870a4ff 100644 --- a/taskcluster/ci/clean/kind.yml +++ b/taskcluster/ci/clean/kind.yml @@ -18,13 +18,14 @@ kind-dependencies: task-defaults: description: Clean {provider} {dataset} dataset {src_locale}-{trg_locale} attributes: - cache-type: dataset - cache-resources: - - pipeline/clean/clean-corpus.sh - - pipeline/clean/tools/deescape-special-chars.perl - - pipeline/clean/tools/remove-non-printing-char.perl - - pipeline/clean/tools/clean_parallel.py - - pipeline/clean/tools/langid_fasttext.py + cache: + type: dataset + 
resources: + - pipeline/clean/clean-corpus.sh + - pipeline/clean/tools/deescape-special-chars.perl + - pipeline/clean/tools/remove-non-printing-char.perl + - pipeline/clean/tools/clean_parallel.py + - pipeline/clean/tools/langid_fasttext.py worker-type: b-linux substitution-fields: - description @@ -78,18 +79,19 @@ tasks: mtdata-{dataset}-{src_locale}-{trg_locale}: provider: mtdata attributes: - cache-resources: - - pipeline/clean/fixes/mtdata_JW300.mt.sh - - pipeline/clean/fixes/mtdata_JW300.sh - - pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.ro.sh - - pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.sh - - pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.ca.sh - - pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.es.sh - - pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.sh - - pipeline/clean/fixes/mtdata_OPUS_ECB_v1.sh - - pipeline/clean/fixes/mtdata_OPUS_SETIMES_v2.sh - - pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.en.sh - - pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.fr.sh + cache: + resources: + - pipeline/clean/fixes/mtdata_JW300.mt.sh + - pipeline/clean/fixes/mtdata_JW300.sh + - pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.ro.sh + - pipeline/clean/fixes/mtdata_neulab_tedtalksv1_train.sh + - pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.ca.sh + - pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.es.sh + - pipeline/clean/fixes/mtdata_OPUS_DOGC_v2.sh + - pipeline/clean/fixes/mtdata_OPUS_ECB_v1.sh + - pipeline/clean/fixes/mtdata_OPUS_SETIMES_v2.sh + - pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.en.sh + - pipeline/clean/fixes/mtdata_OPUS_UNPC_v1_0.fr.sh news-crawl-{dataset}-{src_locale}-{trg_locale}: provider: news-crawl diff --git a/taskcluster/ci/dataset/kind.yml b/taskcluster/ci/dataset/kind.yml index 897904397..ea5a4cf3e 100644 --- a/taskcluster/ci/dataset/kind.yml +++ b/taskcluster/ci/dataset/kind.yml @@ -18,7 +18,8 @@ transforms: task-defaults: worker-type: b-linux attributes: - cache-type: dataset + cache: + type: dataset substitution-fields: - name - label @@ -44,8 +45,9 @@ 
tasks: label: dataset-flores-{dataset}-{src_locale}-{trg_locale} provider: flores attributes: - cache-resources: - - pipeline/data/importers/corpus/flores.sh + cache: + resources: + - pipeline/data/importers/corpus/flores.sh run: command: - bash @@ -57,8 +59,9 @@ tasks: label: dataset-sacrebleu-{dataset}-{src_locale}-{trg_locale} provider: sacrebleu attributes: - cache-resources: - - pipeline/data/importers/corpus/sacrebleu.sh + cache: + resources: + - pipeline/data/importers/corpus/sacrebleu.sh run: command: - bash @@ -71,8 +74,9 @@ tasks: label: dataset-opus-{dataset_no_slashes}-{src_locale}-{trg_locale} provider: opus attributes: - cache-resources: - - pipeline/data/importers/corpus/opus.sh + cache: + resources: + - pipeline/data/importers/corpus/opus.sh run: command: - bash @@ -84,8 +88,9 @@ tasks: label: dataset-mtdata-{dataset}-{src_locale}-{trg_locale} provider: mtdata attributes: - cache-resources: - - pipeline/data/importers/corpus/mtdata.sh + cache: + resources: + - pipeline/data/importers/corpus/mtdata.sh run: command: - bash @@ -97,8 +102,9 @@ tasks: label: dataset-news-crawl-{dataset}-{src_locale}-{trg_locale} provider: news-crawl attributes: - cache-resources: - - pipeline/data/importers/mono/news-crawl.sh + cache: + resources: + - pipeline/data/importers/mono/news-crawl.sh run: command: - bash diff --git a/taskcluster/translations_taskgraph/transforms/cache.py b/taskcluster/translations_taskgraph/transforms/cache.py index 1de81972c..03b212753 100644 --- a/taskcluster/translations_taskgraph/transforms/cache.py +++ b/taskcluster/translations_taskgraph/transforms/cache.py @@ -6,9 +6,10 @@ @transforms.add def add_cache(config, jobs): for job in jobs: - cache_type = job["attributes"]["cache-type"] - cache_resources = job["attributes"]["cache-resources"] - cache_parameters = job["attributes"].get("cache-parameters", {}) + cache = job["attributes"]["cache"] + cache_type = cache["type"] + cache_resources = cache["resources"] + cache_parameters = 
cache.get("parameters", {}) digest_data = [] if cache_resources: From 3553279c583fa25a9be266d73f9543e6d68371fb Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Wed, 3 May 2023 14:54:02 -0400 Subject: [PATCH 23/24] Override compression scheme in pipeline steps. --- taskcluster/ci/bicleaner/kind.yml | 2 ++ taskcluster/ci/clean/kind.yml | 2 ++ taskcluster/ci/dataset/kind.yml | 3 +++ 3 files changed, 7 insertions(+) diff --git a/taskcluster/ci/bicleaner/kind.yml b/taskcluster/ci/bicleaner/kind.yml index 425162dcc..d6c27823c 100644 --- a/taskcluster/ci/bicleaner/kind.yml +++ b/taskcluster/ci/bicleaner/kind.yml @@ -48,6 +48,8 @@ task-defaults: # get interpreted. CUDA_DIR: /builds/worker/fetches/cuda-toolkit CUDNN_DIR: /builds/worker/fetches/cuda-toolkit + COMPRESSION_CMD: zstdmt + ARTIFACT_EXT: zst # Don't run unless explicitly scheduled run-on-tasks-for: [] diff --git a/taskcluster/ci/clean/kind.yml b/taskcluster/ci/clean/kind.yml index 6b870a4ff..42e30b6a6 100644 --- a/taskcluster/ci/clean/kind.yml +++ b/taskcluster/ci/clean/kind.yml @@ -44,6 +44,8 @@ task-defaults: env: SRC: "{src_locale}" TRG: "{trg_locale}" + COMPRESSION_CMD: zstdmt + ARTIFACT_EXT: zst # Don't run unless explicitly scheduled run-on-tasks-for: [] diff --git a/taskcluster/ci/dataset/kind.yml b/taskcluster/ci/dataset/kind.yml index ea5a4cf3e..8ddf61d05 100644 --- a/taskcluster/ci/dataset/kind.yml +++ b/taskcluster/ci/dataset/kind.yml @@ -27,6 +27,9 @@ task-defaults: worker: docker-image: {in-tree: toolchain-build} max-run-time: 1800 + env: + COMPRESSION_CMD: zstdmt + ARTIFACT_EXT: zst artifacts: - name: public/build path: /builds/worker/artifacts From e768cbdda4ba7e16a0458836f4c7717b5c577d06 Mon Sep 17 00:00:00 2001 From: Ben Hearsum Date: Thu, 4 May 2023 13:22:39 -0400 Subject: [PATCH 24/24] Allow training actions to be performed in PRs --- taskcluster/translations_taskgraph/actions/train.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git 
a/taskcluster/translations_taskgraph/actions/train.py b/taskcluster/translations_taskgraph/actions/train.py index 0db7156e7..2c79bd644 100644 --- a/taskcluster/translations_taskgraph/actions/train.py +++ b/taskcluster/translations_taskgraph/actions/train.py @@ -13,7 +13,17 @@ def can_train(parameters): - return parameters["head_repository"] in TRAIN_ON_PROJECTS + # Tasks generated from official repositories should be able to run training. + if parameters["head_repository"] in TRAIN_ON_PROJECTS: + return True + # PRs _to_ official repositories (even if the PR branch is on a fork) should + # also be able to run training. This is important to allow pipeline steps + # to be developed and tested before landing (these do not run automatically + # on push or PR). + if parameters["base_repository"] in TRAIN_ON_PROJECTS and parameters["tasks_for"] == "github-pull-request": + return True + + return False @register_callback_action(