diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 2f9765af24..6746ff98dc 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -16,7 +16,7 @@ jobs: unit-tests: strategy: matrix: - pyVersion: ["3.7", "3.8", "3.9", "3.10"] + pyVersion: ["3.10"] fail-fast: false runs-on: ubuntu-22.04 diff --git a/.gitignore b/.gitignore index c972bfbd5b..5079fb8bf2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,52 @@ +# User Added +.jobenv +**.e[0-9]** +**.o[0-9]** +**.e6** +**.o6** +**.e9** +**.o9** +**.e1** +**.o1** +*.o17* +*.e17* +*.o1 +*.e1 +deps/* +OUTPUTS/* +ALCF/OUTPUTS/* +*tmp* +*core.* +*old* +*.bak +**index-cache** +**pbslogs** +ezpz +*hostfile* .deepspeed_env *.DS_Store +old/* **venv** *.json outputs/ venvs/ wandb/ +llama-logs/ +checkpoints/ +*.gz +*.txt +*.idx +*.bin +*.log +__pycache__ + +.deepspeed_env +*.bak +.cache/* +outputs/ +venvs/ +wandb/ +llama-logs/ checkpoints/ *.gz *.txt diff --git a/ALCF/README.md b/ALCF/README.md new file mode 100644 index 0000000000..b0eb99deb6 --- /dev/null +++ b/ALCF/README.md @@ -0,0 +1,1170 @@ +# Megatron-DeepSpeed @ ALCF + +> [!IMPORTANT] +> [`train_llama_alcf.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/train_llama_alcf.sh) is the main entry point for launching +> distributed training on {Polaris, Aurora, Sunspot} @ ALCF. + +## 🏃‍♂️ Running + +To launch on {`Polaris`, `Aurora`, `Sunspot`} @ [ALCF](https://alcf.anl.gov): + +1.
⏳ Request an interactive job with `qsub -I`:
+
+    ```bash
+    qsub -A <project> -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home -I
+    ```
+
+    - Or, alternatively, you can submit [`train_llama_alcf.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/train_llama_alcf.sh)
+      directly as a batch script with
+
+      ```bash
+      cd Megatron-DeepSpeed
+      qsub -A <project> -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home train_llama_alcf.sh
+      ```
+
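+    Once the interactive job starts, it can help to sanity-check the allocation before launching
+    anything; a quick, optional check using standard PBS environment variables:
+
+    ```bash
+    # confirm the job is running, and count the nodes assigned to it
+    qstat -u "${USER}"
+    wc -l < "${PBS_NODEFILE}"
+    ```
+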
+ +2.
⬇️ Clone repo + navigate into it: + + ```bash + git clone "https://github.com/argonne-lcf/Megatron-DeepSpeed" + cd Megatron-DeepSpeed + ``` + +
+ +3.
🐍 Setup Python: + +
+
+   > **NOTE**: The following commands should be run from [`Megatron-DeepSpeed`](https://github.com/argonne-lcf/Megatron-DeepSpeed), i.e. after the `cd` command from step 2.
+
+   1. Load the `conda` module and activate the base environment:
+
+      ```bash
+      export PBS_O_WORKDIR=$(pwd) && source ALCF/helpers.sh && ezpz_setup
+      ```
+
+      -
[output]: + +
+ + -
[Polaris]: + + ```bash + # [05:47:13 PM][foremans@x3001c0s13b1n0][/eagle/a/f/p/ar/Megatron-DeepSpeed-D/Megatron-DeepSpeed] + $ PBS_O_WORKDIR=$(pwd) source ALCF/helpers.sh && setup_python + Using WORKING_DIR: /eagle/argonne_tpc/foremans/projects/argonne-lcf/Megatron-DeepSpeed-DistributedDataLoading/Megatron-DeepSpeed + No conda_prefix or virtual_env found in environment... + Setting up conda... + Running on Polaris !! + + Lmod is automatically replacing "nvhpc/23.9" with "gcc-native/12.3". + + + Lmod is automatically replacing "PrgEnv-nvhpc/8.5.0" with "PrgEnv-gnu/8.5.0". + + + Due to MODULEPATH changes, the following have been reloaded: + 1) cray-mpich/8.1.28 + + Found conda at: /soft/applications/conda/2024-04-29/mconda3 + No VIRTUAL_ENV found in environment! + - Trying to setup from /soft/applications/conda/2024-04-29/mconda3 + - Using VENV_DIR=/eagle/argonne_tpc/foremans/projects/argonne-lcf/Megatron-DeepSpeed-DistributedDataLoading/Megatron-DeepSpeed/venvs/2024-04-29 + - Found existing venv, activating from /eagle/argonne_tpc/foremans/projects/argonne-lcf/Megatron-DeepSpeed-DistributedDataLoading/Megatron-DeepSpeed/venvs/2024-04-29 + [python] Using: /eagle/argonne_tpc/foremans/projects/argonne-lcf/Megatron-DeepSpeed-DistributedDataLoading/Megatron-DeepSpeed/venvs/2024-04-29/bin/python3 + ``` + +
+ + -
[Aurora]: + + ```bash + # [10:04:02 PM][foremans@x4415c0s2b0n0][/gecko/A/fo/p/a/Megatron-DeepSpeed] + $ PBS_O_WORKDIR=$(pwd) source ALCF/helpers.sh && setup_python + Using WORKING_DIR: /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed + No conda_prefix or virtual_env found in environment... + Setting up conda... + + The following have been reloaded with a version change: + 1) intel_compute_runtime/release/821.36 => intel_compute_runtime/release/803.29 2) oneapi/eng-compiler/2024.04.15.002 => oneapi/release/2024.1 + + Found conda at: /opt/aurora/24.086.0/frameworks/aurora_nre_models_frameworks-2024.1 + No VIRTUAL_ENV found in environment! + - Trying to setup from /opt/aurora/24.086.0/frameworks/aurora_nre_models_frameworks-2024.1 + - Using VENV_DIR=/gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1 + - Found existing venv, activating from /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1 + [python] Using: /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1/bin/python3 + ``` + +
+ + -
[Sunspot]: + + ```bash + # [05:37:18 PM][foremans@x1921c0s0b0n0][/gila/A/fo/p/a/Megatron-DeepSpeed] + $ PBS_O_WORKDIR=$(pwd) source ALCF/helpers.sh && setup_python + Using WORKING_DIR: /gila/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed + No conda_prefix or virtual_env found in environment... + Setting up conda... + Running on SunSpot !! + + Due to MODULEPATH changes, the following have been reloaded: + 1) gcc/12.2.0 5) mpich-config/collective-tuning/1024 + 2) gmp/6.2.1-pcxzkau 6) mpich/icc-all-pmix-gpu/20231026 + 3) mpc/1.3.1-dfagrna 7) oneapi/eng-compiler/2024.04.15.002 + 4) mpfr/4.2.0-w7v7yjv + + The following have been reloaded with a version change: + 1) intel_compute_runtime/release/821.36 => intel_compute_runtime/release/775.20 + 2) spack-pe-gcc/0.7.0-24.086.0 => spack-pe-gcc/0.6.1-23.275.2 + UMD: agama-ci-devel-803.29 successfully loaded: + UMD: graphics-compute-runtime/agama-ci-devel-803.29 + + The following have been reloaded with a version change: + 1) oneapi/eng-compiler/2024.04.15.002 => oneapi/release/2024.04.15.001 + + Found conda at: /soft/datascience/aurora_nre_models_frameworks-2024.1_preview_u1 + No VIRTUAL_ENV found in environment! + - Trying to setup from /soft/datascience/aurora_nre_models_frameworks-2024.1_preview_u1 + - Using VENV_DIR=/gila/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1_preview_u1 + - Found existing venv, activating from /gila/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1_preview_u1 + [python] Using: /lus/gila/projects/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1_preview_u1/bin/python3 + ``` + +
+ + + + 2. 🍋 Install [`ezpz`](https://github.com/saforem2/ezpz): + + ```bash + mkdir deps && git clone https://github.com/saforem2/ezpz deps/ezpz + python3 -m pip install -e deps/ezpz --require-virtualenv + ``` + + [^venv]: Its generally a good practice to keep separate virtual Python environments different projects. + We provide a helper function, [`setup_venv_from_conda()`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/2f0154394bbdf3c64b4669f9d944645e2cdb8f2b/ALCF/helpers.sh#L440), + that helps take care of this for you. +
+
+      This will activate (or build, if necessary) a `venv` in your working directory,
+      _automatically_ matching the name of your active `conda` environment (e.g. `2024-04-29` on Polaris).
+
+   3. Setup [`wandb`](https://docs.wandb.ai/quickstart)
+
+      > **NOTE**: this can be disabled by setting `export WANDB_DISABLED=1`
+
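+      For a fresh environment, logging in once is typically all that is needed; a minimal sketch
+      (assuming you already have a W&B account and API key):
+
+      ```bash
+      # install the client into the active venv (if needed) and authenticate
+      python3 -m pip install wandb
+      wandb login
+
+      # optionally pick the project name up front (the name below is just an example),
+      # or disable logging entirely:
+      export WANDB_PROJECT="AuroraGPT"
+      # export WANDB_DISABLED=1
+      ```
+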
+ + + +4.
🚀 Launch:
+
+   In this case, train a ~2B model (with 10 layers) for 1000 iterations,
+   using the data file list in:
+
+   [`ALCF/data-lists/polaris/books.txt`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/data-lists/polaris/books.txt)
+
+   with a micro-batch size of 2 (`MICRO_BATCH=2`) and the `torch.optim.AdamW` optimizer (`OPT=adamw`).
+
+   **Note** that _any_ of the options in the [`setParams`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/helpers.sh#L140)
+   function from [`ALCF/helpers.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/7d203596dbf14e048e756c5ee6705de7dcb22283/ALCF/helpers.sh)
+   can be overridden dynamically at runtime using this technique.
+
+   ```bash
+   # for systems other than Polaris, replace "polaris/books.txt" below with:
+   # "{aurora,sunspot}/books.txt"
+   PBS_O_WORKDIR=$(pwd) DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt TRAIN_ITER=1000 NLAYERS=10 MICRO_BATCH=2 OPT=adamw bash train_llama_alcf.sh
+   ```
+
+   - **Note**: If no additional options are specified, i.e.
+
+     ```bash
+     PBS_O_WORKDIR=$(pwd) bash train_llama_alcf.sh
+     ```
+
+     then this will fall back to using the default AuroraGPT-7B architecture
+     with the full Dolma (v1.7) dataset.
+
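+   Since `train_llama_alcf.sh` runs in the foreground, it can be convenient to redirect its
+   console output to a file and follow it with `tail`; a minimal sketch using only the variables
+   shown above (the log-file name here is purely illustrative):
+
+   ```bash
+   # same overrides as above, but run in the background with the console output captured
+   PBS_O_WORKDIR=$(pwd) DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt \
+     TRAIN_ITER=1000 NLAYERS=10 MICRO_BATCH=2 OPT=adamw \
+     bash train_llama_alcf.sh > "train-log-$(date +%Y%m%d-%H%M%S).log" 2>&1 &
+
+   # follow the console output (a per-run output.log path is also printed at startup)
+   tail -f train-log-*.log
+   ```
+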
[output]: + +
+ + The outputs should look _something_ like this, though YMMV (things change quick): + +
[Aurora]: + + ```bash + #[🌌][10:45:59 AM][foremans@x4711c1s2b0n0][…/Megatron-DeepSpeed][🌱 main][$!?] + $ export PBS_O_WORKDIR=$(pwd) && source ALCF/helpers.sh && setup_python + + #[🌌][10:46:57 AM][foremans@x4711c1s2b0n0][…/Megatron-DeepSpeed][🌱 main][$!?][aurora_nre_models_frameworks-2024.1] + (aurora_nre_models_frameworks-2024.1) $ PBS_O_WORKDIR=$(pwd) DATA_FILE_LIST=./ALCF/data-lists/aurora/books.txt bash train_llama_alcf.sh > train-log-$(tstamp).log 2>&1 & + + Using WORKING_DIR: /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed + Running on: aurora + Using virtual_env: /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1 on top of conda from: /opt/aurora/24.086.0/frameworks/aurora_nre_models_frameworks-2024.1 + [python] Using: /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1/bin/python3 + Ensuring all dependencies from /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/ALCF/requirements/requirements.txt installed... + + [notice] A new release of pip is available: 24.0 -> 24.1 + [notice] To update, run: pip install --upgrade pip + ┌─────────────────────────────────────────────────────────────────────┐ + │ [savejobenv]: + │ • Writing PBS vars to: /home/foremans/.pbsenv + └─────────────────────────────────────────────────────────────────────┘ + ┌─────────────────────────────────────────────────────────────────────┐ + │ [HOSTS]: + │ • [host:0] - x4711c1s2b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov + │ • [host:1] - x4711c1s3b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov + └─────────────────────────────────────────────────────────────────────┘ + ┌─────────────────────────────────────────────────────────────────────┐ + │ [DIST INFO]: + │ • HOSTFILE=/var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov + │ • NHOSTS=2 + │ • NGPU_PER_HOST=12 + │ • NGPUS=24 + └─────────────────────────────────────────────────────────────────────┘ + ┌─────────────────────────────────────────────────────────────────────┐ + │ [LAUNCH]: + │ • To launch across all available GPUs, use: + │ 'launch' ( = mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov ) + └─────────────────────────────────────────────────────────────────────┘ + 2024-06-21 10:47:09,771 - numexpr.utils - INFO - Note: detected 208 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable. + 2024-06-21 10:47:09,772 - numexpr.utils - INFO - Note: NumExpr detected 208 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8. + 2024-06-21 10:47:09,772 - numexpr.utils - INFO - NumExpr defaulting to 8 threads. 
+ /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1/lib/python3.9/site-packages/pandas/core/computation/expressions.py:21: UserWarning: Pandas requires version '2.8.4' or n> + from pandas.core.computation.check import NUMEXPR_INSTALLED + /opt/aurora/24.086.0/frameworks/aurora_nre_models_frameworks-2024.1/lib/python3.9/runpy.py:127: RuntimeWarning: 'ezpz.jobs' found in sys.modules after import of package 'ezpz', but prior to execution of 'ezpz.jobs'; this may result in u> + warn(RuntimeWarning(msg)) + [2024-06-21 10:47:10][INFO][jobs:366] - Caught PBS_JOBID='684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov', pbsnf=PosixPath('/var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov') from env. Saving jobenv! + [2024-06-21 10:47:10][WARNING][jobs:117] - /home/foremans/PBS-jobs/684084 already in /home/foremans/PBS-jobs.log, not appending !! + [2024-06-21 10:47:10][INFO][jobs:192] - Saving job env to /home/foremans/PBS-jobs/684084/jobenv.sh + [2024-06-21 10:47:10][INFO][jobs:220] - Saving job env to /home/foremans/PBS-jobs/684084/jobenv.json + [2024-06-21 10:47:10][INFO][jobs:233] - Saving job env to /home/foremans/PBS-jobs/684084/jobenv.yaml + [2024-06-21 10:47:10][INFO][jobs:137] - Saving job env to .jobenv file in /home/foremans/PBS-jobs/684084/.jobenv + [2024-06-21 10:47:10][INFO][jobs:137] - Saving job env to .jobenv file in /lus/gecko/projects/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/.jobenv + [2024-06-21 10:47:10][WARNING][jobs:154] - To use launch alias, be sure to: source /lus/gecko/projects/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/.jobenv + [2024-06-21 10:47:10][INFO][jobs:277] - Writing PBS env vars to /home/foremans/PBS-jobs/684084 / jobenv{.sh, .yaml, .json} + [2024-06-21 10:47:10][WARNING][jobs:281] - Run: source ./.jobenv in your current shell to set job variables + [2024-06-21 10:47:10][INFO][jobs:374] - + [DIST_INFO]: + • DEVICE=xpu + • DEVICE_ID=xpu:0 + • DISTRIBUTED_BACKEND=ccl + • GPUS_PER_NODE=12 + • HOSTS=['x4711c1s2b0n0', 'x4711c1s3b0n0'] + • HOSTFILE=/var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov + • HOSTNAME=x4711c1s2b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov + • LOCAL_RANK=0 + • MACHINE=Aurora + • NUM_NODES=2 + • NGPUS=24 + • NODE_ID=0 + • RANK=0 + • SCHEDULER=PBS + • WORLD_SIZE_TOTAL=24 + • WORLD_SIZE_IN_USE=1 + [2024-06-21 10:47:10][CRITICAL][jobs:245] - To launch across ALL GPUs in your job, use: + LAUNCH_CMD=mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov + creating alias launch=mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov + Found ezpz! + + [notice] A new release of pip is available: 24.0 -> 24.1 + [notice] To update, run: pip install --upgrade pip + Done with ezpz. + Not using flash-attn!! 
+ LR_ARGS: --lr 0.0003 --lr-decay-style cosine --lr-warmup-fraction 0.05 + DS_CONFIG: /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/ds-configs/ds_stage1_mb4_gb768_pp1_bf16.json + ZS: 1, MB: 4, GB: 768, PP: 1, DTYPE: bf16 + Please see logs at: logs/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240621-104713_24_x4711c1s2b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov + Checkpoints will be saved to: checkpoints/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05 + !! Caught USE_ACTIVATION_CHECKPOINTING=1 !! + !! Caught USE_ACTIVATION_CHECKPOINTING=1 !! + Setting up tokenizer with Llama2 + Using data_file_list: ./ALCF/data-lists/aurora/books.txt + Using tokenizer: Llama2. Setting up data with ./ALCF/data-lists/aurora/books.txt + Calling: setData() with ./ALCF/data-lists/aurora/books.txt + -------------------- + Updated environment: + DATA_FILE_LIST: ./ALCF/data-lists/aurora/books.txt + NUM_DOCS: 3 + WEIGHT_SUM: 0.0072042092147565125 + DFL_STEM: books + DATA_CACHE_PATH: .cache/books/index-cache + DATA_FLAGS: --data-file-list ./ALCF/data-lists/aurora/books.txt + -------------------- + [setData] DATA_FLAGS: --data-file-list ./ALCF/data-lists/aurora/books.txt + [setData] TOKENIZER_FLAGS: --tokenizer-type Llama2Tokenizer --tokenizer-model /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/ALCF/tokenizer.model + Requirement already satisfied: pybind11 in ./venvs/aurora_nre_models_frameworks-2024.1/lib/python3.9/site-packages (2.12.0) + + [notice] A new release of pip is available: 24.0 -> 24.1 + [notice] To update, run: pip install --upgrade pip + make: Nothing to be done for 'default'. + /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed + ++++++++++++++++++++++++++++++++++++++++++++++++++ + - MPICH_DIR=/opt/aurora/24.086.0/CNDA/mpich/20231026/mpich-ofi-all-icc-default-pmix-gpu-drop20231026 + - Using /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1/bin/python3 + - WORLD_SIZE:24 + - BACKEND: ccl + - MODEL_TYPE: llama-seq4096-pp1-tp1-32layers-32heads-4096hidden + - Using DATA_FILE_LIST: ./ALCF/data-lists/aurora/books.txt + ++++++++++++++++++++++++++++++++++++++++++++++++++ + + Currently Loaded Modules: + 1) mpich/icc-all-pmix-gpu/20231026 3) libfabric/1.15.2.0 5) cray-libpals/1.3.3 7) gmp/6.2.1-pcxzkau 9) mpc/1.3.1-dfagrna 11) intel_compute_runtime/release/803.29 13) frameworks/2024.1 + 2) mpich-config/collective-tuning/1024 4) cray-pals/1.3.3 6) spack-pe-gcc/0.7.0-24.086.0 8) mpfr/4.2.0-w7v7yjv 10) gcc/12.2.0 12) oneapi/release/2024.1 + + + + Saving environment to checkpoints/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/.env + Not currently running. Continuing! 
+ Launching with: MPICH + mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov --genvall --cpu-bind depth -d 16 /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1/bin/python3 -Wignore /lus/gecko/projects/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/pretrain_gpt_alcf.py + Using data_cache_path: checkpoints/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/.cache/books/index-cache + + mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov --genvall --cpu-bind depth -d 16 /gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/venvs/aurora_nre_models_frameworks-2024.1/bin/python3 -Wignore /lus/gecko/projects/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/pretrain_gpt_alcf.py --bf16 --split 100,0,0 --log-interval 1 --no-bias-gelu-fusion --no-bias-dropout-fusion --no-masked-softmax-fusion --no-gradient-accumulation-fusion > + + [!! NOTE] View output at: + logs/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240621-104713_24_x4711c1s2b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov/output.log + Connected to tcp://x4711c1s2b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov:7919 + Launching application eafe3e80-ad2e-4cee-a3e4-d63af2a77c66 + [2024-06-21 10:47:31,610] [INFO] [comm.py:161:init_deepspeed_backend] Initialize ccl backend + [2024-06-21 10:47:31,610] [INFO] [comm.py:637:init_distributed] cdb=None + [2024-06-21 10:47:31,610] [INFO] [comm.py:652:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment... 
+ [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=15, local_rank=3, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=23, local_rank=11, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=12, local_rank=0, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=13, local_rank=1, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=3, local_rank=3, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=14, local_rank=2, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=16, local_rank=4, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=17, local_rank=5, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=18, local_rank=6, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=19, local_rank=7, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=20, local_rank=8, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=21, local_rank=9, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=22, local_rank=10, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=6, local_rank=6, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=8, local_rank=8, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=10, local_rank=10, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend ccl + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=1, local_rank=1, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=2, local_rank=2, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] 
[comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=4, local_rank=4, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=5, local_rank=5, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=7, local_rank=7, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=9, local_rank=9, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:31,611] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=11, local_rank=11, world_size=24, master_addr=10.115.79.12, master_port=29500 + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=2/23][local_rank=2/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=6/23][local_rank=6/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=8/23][local_rank=8/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=1/23][local_rank=1/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=15/23][local_rank=3/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=17/23][local_rank=5/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=3/23][local_rank=3/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=13/23][local_rank=1/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=14/23][local_rank=2/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=4/23][local_rank=4/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=18/23][local_rank=6/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=20/23][local_rank=8/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=5/23][local_rank=5/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=21/23][local_rank=9/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=7/23][local_rank=7/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=23/23][local_rank=11/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=9/23][local_rank=9/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=12/23][local_rank=0/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=16/23][local_rank=4/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=10/23][local_rank=10/11][node=0/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=11/23][local_rank=11/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=19/23][local_rank=7/11][node=1/1] + [2024-06-21 10:47:32][INFO][dist:291] - [device='xpu'][rank=22/23][local_rank=10/11][node=0/1] + 2024-06-21 10:47:32][INFO][dist:240] - DistInfo={ + "DEVICE": "xpu", + "DEVICE_ID": "xpu:0", + "DISTRIBUTED_BACKEND": "ccl", + "GPUS_PER_NODE": 12, + "HOSTFILE": "/var/spool/pbs/aux/684084.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov", + "HOSTNAME": "x4711c1s2b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov", + "HOSTS": "['x4711c1s2b0n0', 'x4711c1s3b0n0']", + "LOCAL_RANK": 0, + "MACHINE": "Aurora", + "NGPUS": 24, + "NODE_ID": 0, + "NUM_NODES": 2, + "RANK": 0, + "SCHEDULER": "PBS", + 
"WORLD_SIZE_IN_USE": 24, + "WORLD_SIZE_TOTAL": 24 + } + + # [...clipped...] + + [2024-06-21 10:48:48][INFO][utils:307] - > elapsed time for building blendable dataset indices: 1.19 (sec) + [2024-06-21 10:48:48][INFO][utils:307] - > saving index map files + [2024-06-21 10:48:51][INFO][utils:307] - > finished saving index map files in 3.0829622745513916 seconds + [2024-06-21 10:48:51][INFO][utils:307] - > loading blendable dataset index: checkpoints/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/.cache/books/index-cache/49e9529a32d0a98f1e40f4a82872b11c_index.npy + [2024-06-21 10:48:52][INFO][utils:307] - > loading blendable dataset sample index: checkpoints/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/.cache/books/index-cache/49e9529a32d0a98f1e40f4a82872b11c_sample_index.npy + [2024-06-21 10:48:52][INFO][utils:307] - > finished loading in 0.30188989639282227 seconds + [2024-06-21 10:48:52][INFO][utils:307] - >> building dataset for /gecko/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document + [2024-06-21 10:48:52][INFO][utils:307] - > building dataset index ... + [2024-06-21 10:48:52][INFO][utils:307] - reading sizes... + [2024-06-21 10:48:52][INFO][utils:307] - reading pointers... + [2024-06-21 10:48:52][INFO][utils:307] - reading document index... + [2024-06-21 10:48:52][INFO][utils:307] - creating numpy buffer of mmap... + [2024-06-21 10:48:52][INFO][utils:307] - /gecko/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document.bin + [2024-06-21 10:48:52][INFO][utils:307] - creating memory view of numpy buffer... + [2024-06-21 10:48:52][INFO][utils:307] - > finished creating indexed dataset in 0.003112 seconds + [2024-06-21 10:48:52][INFO][utils:307] - number of documents: 7386 + [2024-06-21 10:48:52][INFO][utils:307] - > dataset split: + [2024-06-21 10:48:52][INFO][utils:307] - train: + [2024-06-21 10:48:52][INFO][utils:307] - document indices in [0, 7386) total of 7386 documents + [2024-06-21 10:48:52][INFO][utils:307] - validation: + [2024-06-21 10:48:52][INFO][utils:307] - document indices in [7386, 7386) total of 0 documents + [2024-06-21 10:48:52][INFO][utils:307] - test: + [2024-06-21 10:48:52][INFO][utils:307] - document indices in [7386, 7386) total of 0 documents + [2024-06-21 10:48:52][INFO][utils:307] - > loading doc-idx mapping from checkpoints/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/.cache/books/index-cache/1fa7757ef8907da21e1e1326705e7f3f_doc_idx.npy + [2024-06-21 10:48:52][INFO][utils:307] - > loading sample-idx mapping from checkpoints/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/.cache/books/index-cache/1fa7757ef8907da21e1e1326705e7f3f_sample_idx.npy + [2024-06-21 10:48:52][INFO][utils:307] - > loading shuffle-idx mapping from checkpoints/ws24_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/.cache/books/index-cache/1fa7757ef8907da21e1e1326705e7f3f_shuffle_idx.npy + [2024-06-21 10:48:52][INFO][utils:307] - loaded indexed file in 0.008 seconds + [2024-06-21 10:48:52][INFO][utils:307] - total number of samples: 34196233 + [2024-06-21 10:48:52][INFO][utils:307] - total number of epochs: 175 + [2024-06-21 10:48:52][INFO][utils:307] - > size of blendable dataset: 245361763 samples + [2024-06-21 10:48:52][INFO][utils:307] - >>> Finished building BlendableDataset in 4.613574266433716 seconds + 
[2024-06-21 10:48:52][INFO][pretrain_gpt_alcf:579] - > finished creating GPT datasets. Took: 45730179865763.24219s + [2024-06-21 10:48:53][INFO][training:88] - [after dataloaders are built] datetime=2024-06-21 10:48:53 + [2024-06-21 10:48:53][INFO][training:307] - done with setup ... + [2024-06-21 10:48:53][INFO][training:313] - training ... + (min, max) time across ranks (ms): + model-and-optimizer-setup ......................: (63763.34, 63857.25) + train/valid/test-data-iterators-setup ..........: (12936.53, 13432.64) + [2024-06-21 10:48:53][INFO][training:88] - [before the start of training step] datetime=2024-06-21 10:48:53 + [2024-06-21 10:48:53,396] [INFO] [checkpointing.py:541:forward] Activation Checkpointing Information + [2024-06-21 10:48:53,396] [INFO] [checkpointing.py:542:forward] ----Partition Activations False, CPU CHECKPOINTING False + [2024-06-21 10:48:53,396] [INFO] [checkpointing.py:543:forward] ----contiguous Memory Checkpointing False with 32 total layers + [2024-06-21 10:48:53,396] [INFO] [checkpointing.py:545:forward] ----Synchronization False + [2024-06-21 10:48:53,396] [INFO] [checkpointing.py:546:forward] ----Profiling time in checkpointing False + [2024-06-21 10:50:42,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1867.64 | optimizer_gradients: 19.65 | optimizer_step: 46.07 + [2024-06-21 10:50:42,167] [INFO] [logging.py:96:log_dist] [Rank 0] step=1, skipped=0, lr=[1.887433467970254e-08, 1.887433467970254e-08], mom=[(0.9, 0.999), (0.9, 0.999)] + [2024-06-21 10:50:42,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 25341.72 | bwd_microstep: 77707.38 | bwd_inner_microstep: 75751.84 | bwd_allreduce_microstep: 1955.54 | step_microstep: 2218.38 + [2024-06-21 10:50:42,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 25341.72 | bwd: 77707.38 | bwd_inner: 75751.84 | bwd_allreduce: 1955.54 | step: 2218.38 + [2024-06-21 10:50:42][INFO][training:1609] - iteration= 1/ 317892 | consumed_samples= 768 | consumed_tokens= 3145728 | elapsed_time_per_iteration_ms=108893.2 | learning_rate=1.88743e-08 | global_batch_size= 768 | lm loss=11.133188 | loss_scale=1.0 | actual_seqlen= 4096 | number_of_skipped_iterations= 0 | number_of_nan_iterations= 0 | samples_per_second=7.053 | tokens_per_gpu_per_second_tgs=1203.674 | [LM]-TFLOPs=49.66 | [DS]-TFLOPs=73.32 | + [2024-06-21 10:50:42][INFO][utils:190] - [Rank 0] (after 1 iterations) memory (MB) | allocated: 18243.64111328125 | max allocated: 50664.2548828125 | reserved: 54556.0 | max reserved: 54556.0 + (min, max) time across ranks (ms): + forward-backward ...............................: (106622.81, 106624.28) + optimizer ......................................: (2221.02, 2234.98) + ``` + +
+ +
[Sunspot]: + + ```bash + # [09:07:32 AM][foremans@x1921c0s0b0n0][~/q/llm.devkit/Megatron-DeepSpeed][🌱 main][$!?] + $ PBS_O_WORKDIR=$(pwd) DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt bash train_llama_alcf.sh + source-ing /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/ALCF/helpers.sh + Sourcing /home/foremans/q4-drop_sunspot/llm.devkit/setenv.sh... + UMD: agama-ci-devel-736.9 successfully loaded: + UMD: graphics-compute-runtime/agama-ci-devel-736.9 + Lmod has detected the following error: The following module(s) are unknown: "gcc/12.1.0" + + Please check the spelling or version number. Also try "module spider ..." + It is also possible your cache file is out-of-date; it may help to try: + $ module --ignore_cache load "gcc/12.1.0" + + Also make sure that all modulefiles written in TCL start with the string #%Module + + Note: the module "intel_compute_runtime/release/agama-devel-647" cannot be unloaded because it was not loaded. + + Running on SunSpot !! + [python] Using: /home/foremans/miniconda3/envs/q4-drop/bin/python3 + Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env + Found ezpz! + /lus/gila/projects/Aurora_deployment/foremans/locations/sunspot/projects/saforem2/ezpz/src/ezpz/__init__.py + Has ezpz installed. Nothing to do. + Done with ezpz. + ┌─────────────────────────────────────────────────────────────────── + │ Writing PBS vars to /home/foremans/.pbsenv + │ HOSTFILE: /var/spool/pbs/aux/8988430.amn-0001 + │ NHOSTS: 2 + │ NGPU_PER_HOST: 12 GPUs per host + │ NGPUS: 24 GPUs total + └─────────────────────────────────────────────────────────────────── + ┌────────────────────────────────────────────────────────────────── + │ [Hosts]: + │ • [host:0] - x1921c0s0b0n0.hostmgmt2000.cm.americas.sgi.com + │ • [host:1] - x1921c0s1b0n0.hostmgmt2000.cm.americas.sgi.com + └────────────────────────────────────────────────────────────────── + ┌────────────────────────────────────────────────────────────────── + │ [DIST INFO]: + │ • Loading job env from: /home/foremans/.pbsenv + │ • HOSTFILE: /var/spool/pbs/aux/8988430.amn-0001 + │ • NHOSTS: 2 + │ • NGPU_PER_HOST: 12 + │ • NGPUS (NHOSTS x NGPU_PER_HOST): 24 + │ • WORLD_SIZE: 24 + │ • DIST_LAUNCH: mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/8988430.amn-0001 + └────────────────────────────────────────────────────────────────── + ┌────────────────────────────────────────────────────────────────── + │ [Launch]: + │ • Use: 'launch' (=mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/8988430.amn-0001) + │ to launch job + └────────────────────────────────────────────────────────────────── + DS_CONFIG: ds_stage2_mb4_gb96_pp1_bf16.json + ZS: 2, CPU_OPTIMIZER: , MB: 4, GB: 96, PP: 1, DTYPE: bf16!!!Please see logs at logs/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/0404090742_x1921c0s0b0n0 + !! Caught USE_ACTIVATION_CHECKPOINTING=1 !! + !! Caught USE_ACTIVATION_CHECKPOINTING=1 !! 
+ Calling: setData() with ./convergence_debug_small.txt + -------------------- + Updated environment: + DATA_FILE_LIST: ./convergence_debug_small.txt + NUM_DOCS: 15 + WEIGHT_SUM: 15.0 + DFL_STEM: convergence_debug_small + DATA_CACHE_PATH: /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache + -------------------- + ++++++++++++++++++++++++++++++++++++++++++++++++++ + - MPICH_DIR= + - Using /home/foremans/miniconda3/envs/q4-drop/bin/python3 + - WORLD_SIZE:24 + - NCCL: nccl + - MODEL_TYPE: llama-seq4096-pp1-tp1-32layers-32heads-4096hidden + - Using DATA_FILE_LIST: ./convergence_debug_small.txt + ++++++++++++++++++++++++++++++++++++++++++++++++++ + ! Using /home/foremans/miniconda3/envs/q4-drop/bin/deepspeed + /home/foremans/miniconda3/envs/q4-drop/bin/ds_report:4: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html + __import__('pkg_resources').require('deepspeed==0.12.3+6ea44d02') + /home/foremans/miniconda3/envs/q4-drop/lib/python3.9/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: ''If you dont plan on using image function + ality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torch + vision` from source? + warn( + [2024-04-04 09:07:45,585] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to xpu (auto detect) + [2024-04-04 09:07:45,818] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to xpu (auto detect) + -------------------------------------------------- + DeepSpeed C++/CUDA extension op report + -------------------------------------------------- + NOTE: Ops not installed will be just-in-time (JIT) compiled at + runtime if needed. Op compatibility means that your system + meet the required dependencies to JIT install the op. + -------------------------------------------------- + JIT compiled ops requires ninja + ninja .................. [OKAY] + -------------------------------------------------- + op name ................ installed .. compatible + -------------------------------------------------- + async_io ............... [NO] ....... [OKAY] + cpu_adagrad ............ [NO] ....... [OKAY] + cpu_adam ............... [NO] ....... [OKAY] + flash_attn ............. [NO] ....... [OKAY] + fused_adam ............. [NO] ....... [OKAY] + quantizer .............. [NO] ....... [OKAY] + transformer ............ [NO] ....... [OKAY] + transformer_inference .. [NO] ....... [OKAY] + utils .................. [NO] ....... [OKAY] + -------------------------------------------------- + DeepSpeed general environment info: + torch install path ............... ['/home/foremans/miniconda3/envs/q4-drop/lib/python3.9/site-packages/torch'] + torch version .................... 2.1.0a0+cxx11.abi + deepspeed install path ........... ['/lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/DeepSpeed/deepspeed'] + deepspeed info ................... 0.12.3+6ea44d02, 6ea44d02, HEAD + deepspeed wheel compiled w. ...... torch 2.1 + shared memory (/dev/shm) size .... 
503.18 GB + + deepspeed --hostfile /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/hostfile_deepspeed --launcher MPICH /lus/gila/projects/Aurora_deployment/ + foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/pretrain_gpt_alcf.py --bf16 --optimizer adamw --split 100,0,0 --log-interval 1 --no-bias-gelu-fusion --lr-decay + -style cosine --no-bias-dropout-fusion --no-masked-softmax-fusion --tokenizer-type Llama2Tokenizer --no-gradient-accumulation-fusion --accumulate-allreduce-grads-in-fp32 + --use-checkpoint-opt_param-scheduler --tensorboard-dir checkpoints/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/tensorboard --log-timers-to-tensorboard --log-optimizer + -states-to-tensorboard --lr 0.0003 --save checkpoints/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16 --load checkpoints/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16 + --seq-length 4096 --num-layers 32 --hidden-size 4096 --train-iters 317892 --eval-iters 10 --distributed-backend ccl --num-attention-heads 32 --save-interval 20 + 0 --eval-interval 50000 --max-position-embeddings 4096 --micro-batch-size 4 --data-file-list ./convergence_debug_small.txt --tensor-model-parallel-size 1 --global-bat + ch-size 96 --pipeline-model-parallel-size 1 --num-key-value-heads 8 --data-cache-path /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/ + .cache/convergence_debug_small/index-cache --ffn-hidden-size 11008 --tokenizer-model /home/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/ALCF/tokenizer.model --no-query- + key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear --deepspeed-activation-checkpointing --z + ero-stage=2 --deepspeed_config=ds_stage2_mb4_gb96_pp1_bf16.json --no-pipeline-parallel --deepspeed --checkpoint-activations --checkpoint-num-layers 1 |& tee logs/ds_stage2 + _nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/0404090742_x1921c0s0b0n0/output.log + + [!! NOTE] View output at: + logs/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/0404090742_x1921c0s0b0n0/output.log + + # ... + + /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0051_text_document.bin + creating memory view of numpy buffer... 
+ > finished creating indexed dataset in 0.010017 seconds + number of documents: 1498927 + > dataset split: + train: + document indices in [0, 1498927) total of 1498927 documents + validation: + document indices in [1498927, 1498927) total of 0 documents + test: + document indices in [1498927, 1498927) total of 0 documents + > loading doc-idx mapping from /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/bf90c74a625ac2ee4de6e1d6f7f84fbb_doc_idx.npy + > loading sample-idx mapping from /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/bf90c74a625ac2ee4de6e1d6f7f84fbb_sample_idx.npy + > loading shuffle-idx mapping from /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/bf90c74a625ac2ee4de6e1d6f7f84fbb_shuffle_idx.npy + loaded indexed file in 0.056 seconds + total number of samples: 2318461 + total number of epochs: 8 + > loading blendable dataset index: /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/3a426af74008c22f9db24db811aad6b7_index.npy + > loading blendable dataset sample index: /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/3a426af74008c22f9db24db811aad6b7_sample_index.npy + /home/foremans/miniconda3/envs/q4-drop/lib/python3.9/site-packages/torch/utils/data/dataloader.py:557: UserWarning: This DataLoader will create 2 worker processes in total. Our suggested max number of worker in current system is 1, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary. + + [after dataloaders are built] datetime: 2024-04-04 09:09:27 + done with setup ... + (min, max) time across ranks (ms): + model-and-optimizer-setup ......................: (64818.18, 64858.22) + train/valid/test-data-iterators-setup ..........: (1968.10, 2288.56) + training ... + [before the start of training step] datetime: 2024-04-04 09:09:27 + [2024-04-04 09:09:27,718] [INFO] [checkpointing.py:540:forward] Activation Checkpointing Information + [2024-04-04 09:09:27,719] [INFO] [checkpointing.py:541:forward] ----Partition Activations False, CPU CHECKPOINTING False + [2024-04-04 09:09:27,719] [INFO] [checkpointing.py:542:forward] ----contiguous Memory Checkpointing False with 32 total layers + [2024-04-04 09:09:27,719] [INFO] [checkpointing.py:544:forward] ----Synchronization False + [2024-04-04 09:09:27,719] [INFO] [checkpointing.py:545:forward] ----Profiling time in checkpointing False + [2024-04-04 09:09:33][INFO][utils:145] - Note: detected 208 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable. + [2024-04-04 09:09:33][INFO][utils:148] - Note: NumExpr detected 208 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8. + [2024-04-04 09:09:33][INFO][utils:160] - NumExpr defaulting to 8 threads. 
+ ^[c[2024-04-04 09:09:53,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 884.11 | optimizer_gradients: 6.43 | optimizer_step: 23.44 + [2024-04-04 09:09:53,312] [INFO] [logging.py:96:log_dist] [Rank 0] step=1, skipped=0, lr=[0.00029999999999267505, 0.00029999999999267505], mom=[(0.9, 0.999), (0.9, 0.999)] + [2024-04-04 09:09:53,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 6567.68 | bwd_microstep: 17950.36 | bwd_inner_microstep: 17711.20 | bwd_allreduce_microstep: 239.11 | step_microstep: 1139.27 + [2024-04-04 09:09:53,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 6567.66 | bwd: 17950.35 | bwd_inner: 17711.19 | bwd_allreduce: 239.11 | step: 1139.29 + [Rank 0] (after 1 iterations) memory (MB) | allocated: 18244.640625 | max allocated: 41299.50146484375 | reserved: 46764.0 | max reserved: 46764.0 + iteration 1/ 317892 | consumed samples: 96 | consumed tokens: 393216 | elapsed time per iteration (ms): 25849.1 | learning rate: 3.000E-04 | global batch size: 96 | lm loss: 1.117136E+01 | loss scale: 1.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 3.714 | tokens per gpu per second(tgs): 633.832 | TFLOPs: 38.61 | + [2024-04-04 09:10:13,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 327.85 | optimizer_gradients: 6.26 | optimizer_step: 23.60 + [2024-04-04 09:10:13,619] [INFO] [logging.py:96:log_dist] [Rank 0] step=2, skipped=0, lr=[0.00029999999997070033, 0.00029999999997070033], mom=[(0.9, 0.999), (0.9, 0.999)] + [2024-04-04 09:10:13,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 4022.74 | bwd_microstep: 15738.67 | bwd_inner_microstep: 15556.80 | bwd_allreduce_microstep: 181.82 | step_microstep: 371.01 + [2024-04-04 09:10:13,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4022.73 | bwd: 15738.66 | bwd_inner: 15556.62 | bwd_allreduce: 181.81 | step: 371.02 + iteration 2/ 317892 | consumed samples: 192 | consumed tokens: 786432 | elapsed time per iteration (ms): 20298.3 | learning rate: 3.000E-04 | global batch size: 96 | lm loss: 2.537718E+01 | loss scale: 1.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 4.729 | tokens per gpu per second(tgs): 807.159 | TFLOPs: 49.17 | + ``` + +
+ +
[Polaris]: + + ```bash + # [09:31:35 AM][foremans@x3112c0s13b0n0][~/pol/p/a/Megatron-DeepSpeed][🌱 main][$!?] + $ PBS_O_WORKDIR=$(pwd) DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt OPT=adamw bash train_llama_alcf.sh + source-ing /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/ALCF/helpers.sh + Running on Polaris !! + + [python] Using: /eagle/datascience/foremans/miniconda3/envs/cu118-pt221/bin/python3 + Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env + Found ezpz! + /lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/ezpz/src/ezpz/__init__.py + Has ezpz installed. Nothing to do. + Done with ezpz. + ┌─────────────────────────────────────────────────────────────────── + │ Writing PBS vars to /home/foremans/.pbsenv + │ HOSTFILE: /var/spool/pbs/aux/1822297.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov + │ NHOSTS: 2 + │ NGPU_PER_HOST: 4 GPUs per host + │ NGPUS: 8 GPUs total + └─────────────────────────────────────────────────────────────────── + ┌────────────────────────────────────────────────────────────────── + │ [Hosts]: + │ • [host:0] - x3112c0s13b0n0.hsn.cm.polaris.alcf.anl.gov + │ • [host:1] - x3112c0s13b1n0.hsn.cm.polaris.alcf.anl.gov + └────────────────────────────────────────────────────────────────── + ┌────────────────────────────────────────────────────────────────── + │ [DIST INFO]: + │ • Loading job env from: /home/foremans/.pbsenv + │ • HOSTFILE: /var/spool/pbs/aux/1822297.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov + │ • NHOSTS: 2 + │ • NGPU_PER_HOST: 4 + │ • NGPUS (NHOSTS x NGPU_PER_HOST): 8 + │ • WORLD_SIZE: 8 + │ • DIST_LAUNCH: mpiexec --verbose --envall -n 8 -ppn 4 --hostfile /var/spool/pbs/aux/1822297.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov + └────────────────────────────────────────────────────────────────── + ┌────────────────────────────────────────────────────────────────── + │ [Launch]: + │ • Use: 'launch' (=mpiexec --verbose --envall -n 8 -ppn 4 --hostfile /var/spool/pbs/aux/1822297.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov) + │ to launch job + └────────────────────────────────────────────────────────────────── + DS_CONFIG: ds_stage2_mb8_gb32_pp1_bf16.json + ZS: 2, CPU_OPTIMIZER: , MB: 8, GB: 32, PP: 1, DTYPE: bf16!!!Please see logs at logs/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16/0404093534_x3112c0s13b0n0 + !! Caught USE_ACTIVATION_CHECKPOINTING=1 !! + !! Caught USE_ACTIVATION_CHECKPOINTING=1 !! + Calling: setData() with "./convergence_debug_small.txt" + -------------------- + Updated environment: + DATA_FILE_LIST: ./convergence_debug_small.txt + NUM_DOCS: 15 + WEIGHT_SUM: 15.0 + DFL_STEM: convergence_debug_small + DATA_CACHE_PATH: /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache + -------------------- + ++++++++++++++++++++++++++++++++++++++++++++++++++ + - MPICH_DIR=/opt/cray/pe/mpich/8.1.25/ofi/gnu/9.1 + - Using /eagle/datascience/foremans/miniconda3/envs/cu118-pt221/bin/python3 + - WORLD_SIZE:8 + - NCCL: nccl + - MODEL_TYPE: llama-seq4096-pp1-tp2-32layers-32heads-4096hidden + - Using DATA_FILE_LIST: ./convergence_debug_small.txt + ++++++++++++++++++++++++++++++++++++++++++++++++++ + ! 
Using /eagle/datascience/foremans/miniconda3/envs/cu118-pt221/bin/deepspeed + [2024-04-04 09:35:35,959] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda [auto detect] + -------------------------------------------------- + DeepSpeed C++/CUDA extension op report + -------------------------------------------------- + NOTE: Ops not installed will be just-in-time (JIT) compiled at + runtime if needed. Op compatibility means that your system + meet the required dependencies to JIT install the op. + -------------------------------------------------- + JIT compiled ops requires ninja + ninja .................. [OKAY] + -------------------------------------------------- + op name ................ installed .. compatible + -------------------------------------------------- + async_io ............... [NO] ....... [OKAY] + fused_adam ............. [NO] ....... [OKAY] + cpu_adam ............... [NO] ....... [OKAY] + cpu_adagrad ............ [NO] ....... [OKAY] + cpu_lion ............... [NO] ....... [OKAY] + [WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH + evoformer_attn ......... [NO] ....... [NO] + fused_lamb ............. [NO] ....... [OKAY] + fused_lion ............. [NO] ....... [OKAY] + inference_core_ops ..... [NO] ....... [OKAY] + cutlass_ops ............ [NO] ....... [OKAY] + transformer_inference .. [NO] ....... [OKAY] + quantizer .............. [NO] ....... [OKAY] + ragged_device_ops ...... [NO] ....... [OKAY] + ragged_ops ............. [NO] ....... [OKAY] + random_ltd ............. [NO] ....... [OKAY] + [WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.2 + [WARNING] using untested triton version (2.2.0), only 1.0.0 is known to be compatible + sparse_attn ............ [NO] ....... [NO] + spatial_inference ...... [NO] ....... [OKAY] + transformer ............ [NO] ....... [OKAY] + stochastic_transformer . [NO] ....... [OKAY] + -------------------------------------------------- + DeepSpeed general environment info: + torch install path ............... ['/eagle/datascience/foremans/miniconda3/envs/cu118-pt221/lib/python3.12/site-packages/torch'] + torch version .................... 2.2.1 + deepspeed install path ........... ['/eagle/datascience/foremans/miniconda3/envs/cu118-pt221/lib/python3.12/site-packages/deepspeed'] + deepspeed info ................... 0.14.0, unknown, unknown + torch cuda version ............... 11.8 + torch hip version ................ None + nvcc version ..................... 11.8 + deepspeed wheel compiled w. ...... torch 2.2, cuda 11.8 + shared memory (/dev/shm) size .... 
251.61 GB + + deepspeed --hostfile /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/hostfile_deepspeed --launcher MPICH /lus/eagle/projects/datascienc + e/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/pretrain_gpt_alcf.py --bf16 --optimizer adamw --split 100,0,0 --log-interval 1 --no-bias-gelu-fusion + --lr-decay-style cosine --no-bias-dropout-fusion --no-masked-softmax-fusion --tokenizer-type Llama2Tokenizer --no-gradient-accumulation-fusion --accumulate-allreduce- + grads-in-fp32 --use-checkpoint-opt_param-scheduler --tensorboard-dir checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16/tensorboard --log-timers-to-tensorboard - + -log-optimizer-states-to-tensorboard --lr 0.0003 --save checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16 --load checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_ + pp1_tp2_bf16 --seq-length 4096 --num-layers 32 --hidden-size 4096 --train-iters 317892 --eval-iters 10 --distributed-backend nccl --num-attention-heads 32 --s + ave-interval 200 --eval-interval 50000 --max-position-embeddings 4096 --micro-batch-size 8 --data-file-list ./convergence_debug_small.txt --tensor-model-parallel-size 2 + --global-batch-size 32 --pipeline-model-parallel-size 1 --num-key-value-heads 8 --data-cache-path /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-l + cf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache --ffn-hidden-size 11008 --tokenizer-model /home/foremans/polaris/projects/argonne-lcf/Megatron-DeepSpeed/ALCF/tokeniz + er.model --no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear --use-flash-attn-v2 + --deepspeed-activation-checkpointing --zero-stage=2 --deepspeed_config=ds_stage2_mb8_gb32_pp1_bf16.json --no-pipeline-parallel --deepspeed --checkpoint-activations --checkpoint- + num-layers 1 |& tee logs/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16/0404093534_x3112c0s13b0n0/output.log + + [!! NOTE] View output at: + logs/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16/0404093534_x3112c0s13b0n0/output.log + + # ... + + /eagle/datasets/dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0051_text_document.bin + creating memory view of numpy buffer... 
+ > finished creating indexed dataset in 0.001280 seconds + number of documents: 1498927 + > dataset split: + train: + document indices in [0, 1498927) total of 1498927 documents + validation: + document indices in [1498927, 1498927) total of 0 documents + test: + document indices in [1498927, 1498927) total of 0 documents + > loading doc-idx mapping from /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/9217d94f3290abc2fddf9e87bff236d6_doc_idx.npy + > loading sample-idx mapping from /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/9217d94f3290abc2fddf9e87bff236d6_sample_idx.npy + > loading shuffle-idx mapping from /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/9217d94f3290abc2fddf9e87bff236d6_shuffle_idx.npy + loaded indexed file in 0.004 seconds + total number of samples: 869423 + total number of epochs: 3 + > loading blendable dataset index: /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/a815d51f6752c6f486d94194ce95fb87_index.npy + > loading blendable dataset sample index: /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/a815d51f6752c6f486d94194ce95fb87_sample_index.npy + > size of blendable dataset: 10223415 samples + > finished creating GPT datasets ... + [after dataloaders are built] datetime: 2024-04-04 09:36:07 + done with setup ... + (min, max) time across ranks (ms): + model-and-optimizer-setup ......................: (4794.78, 4795.23) + train/valid/test-data-iterators-setup ..........: (589.69, 721.20) + training ... 
+ [before the start of training step] datetime: 2024-04-04 09:36:07 + [2024-04-04 09:36:07,407] [INFO] [checkpointing.py:539:forward] Activation Checkpointing Information + [2024-04-04 09:36:07,407] [INFO] [checkpointing.py:540:forward] ----Partition Activations False, CPU CHECKPOINTING False + [2024-04-04 09:36:07,407] [INFO] [checkpointing.py:541:forward] ----contiguous Memory Checkpointing False with 32 total layers + [2024-04-04 09:36:07,407] [INFO] [checkpointing.py:543:forward] ----Synchronization False + [2024-04-04 09:36:07,407] [INFO] [checkpointing.py:544:forward] ----Profiling time in checkpointing False + [2024-04-04 09:36:28,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1626.54 | optimizer_gradients: 19.29 | optimizer_step: 419.48 + [2024-04-04 09:36:28,430] [INFO] [logging.py:96:log_dist] [Rank 0] step=1, skipped=0, lr=[0.00029999999999267505, 0.00029999999999267505], mom=[(0.9, 0.999), (0.9, 0.999)] + [2024-04-04 09:36:28,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 11336.34 | bwd_microstep: 7134.73 | bwd_inner_microstep: 7090.02 | bwd_allreduce_microstep: 44.65 | step_microstep: 2564.02 + [2024-04-04 09:36:28,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 11336.33 | bwd: 7134.75 | bwd_inner: 7090.01 | bwd_allreduce: 44.66 | step: 2564.02 + iteration 1/ 317892 | consumed samples: 32 | consumed tokens: 131072 | elapsed time per iteration (ms): 21133.8 | learning rate: 3.000E-04 | global batch size: 32 | lm loss: 1.119983E+01 | loss scale: 1.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1.514 | tokens per gpu per second(tgs): 775.250 | TFLOPs: 47.23 | + [Rank 1] (after 1 iterations) memory (MB) | allocated: 14165.525390625 | max allocated: 22332.37255859375 | reserved: 24642.0 | max reserved: 35824.0 + [Rank 0] (after 1 iterations) memory (MB) | allocated: 14165.525390625 | max allocated: 22332.37255859375 | reserved: 24642.0 | max reserved: 32994.0 + [2024-04-04 09:36:38,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1605.55 | optimizer_gradients: 11.56 | optimizer_step: 50.92 + [2024-04-04 09:36:38,623] [INFO] [logging.py:96:log_dist] [Rank 0] step=2, skipped=0, lr=[0.00029999999997070033, 0.00029999999997070033], mom=[(0.9, 0.999), (0.9, 0.999)] + [2024-04-04 09:36:38,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1395.17 | bwd_microstep: 6832.48 | bwd_inner_microstep: 6789.73 | bwd_allreduce_microstep: 42.70 | step_microstep: 1867.64 + [2024-04-04 09:36:38,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 1395.15 | bwd: 6832.49 | bwd_inner: 6789.73 | bwd_allreduce: 42.71 | step: 1867.65 + iteration 2/ 317892 | consumed samples: 64 | consumed tokens: 262144 | elapsed time per iteration (ms): 10154.3 | learning rate: 3.000E-04 | global batch size: 32 | lm loss: 1.766422E+01 | loss scale: 1.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 3.151 | tokens per gpu per second(tgs): 1613.503 | TFLOPs: 98.29 | + + # ... + ``` + +
+ +
+ +
+ + + + + + + + + + + + +### 🚀 Submit as a batch job + +```bash +$ cd Megatron-DeepSpeed +$ qsub -A -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home train_llama_alcf.sh +``` + + + +## 📝 Data Preprocessing + +
Data Pre-Processing:

AuroraGPT is trained on the Dolma dataset (initially v0; we are now in the process of moving to v6). For more details on the dataset, refer to https://huggingface.co/datasets/allenai/dolma. The downloaded Dolma dataset has already been preprocessed to remove duplicates (dedup) and to filter / mix the data (mixing). For more details, refer to https://github.com/allenai/dolma/tree/main/docs and https://github.com/vksastry/dolma_alcf/blob/main/ALCF/Readme.md.

The remaining preprocessing step before training is tokenizing the data with a specific tokenizer (currently the `Llama2Tokenizer`). Use the script below to tokenize the entire dataset (example shown for Polaris):

``` bash
cd /eagle/datasets/dolma/utils
./tokenization.sh
```
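
For reference, `tokenization.sh` converts each raw Dolma shard into the indexed `*_text_document.{bin,idx}` format that the weighted `--data-file-list` files (e.g. those under `ALCF/data-lists/`) point at. Below is a minimal sketch of that per-shard step, assuming the standard `tools/preprocess_data.py` entry point; the input / output paths are illustrative placeholders, and the exact flags used on Polaris live in `tokenization.sh` itself.

```bash
# Illustrative only: roughly what tokenization.sh does for a single shard.
# Paths below are placeholders; check /eagle/datasets/dolma/utils/tokenization.sh
# for the real inputs, outputs, and flags.
python tools/preprocess_data.py \
  --input /eagle/datasets/dolma/raw/books-0000.jsonl \
  --output-prefix /eagle/datasets/dolma/data_Llama2Tokenizer/books-0000 \
  --tokenizer-type Llama2Tokenizer \
  --tokenizer-model ALCF/tokenizer.model \
  --workers 16 \
  --append-eod
```

Each shard produces a `books-0000_text_document.{bin,idx}` pair, and the tokenizer settings mirror the training flags shown above (`--tokenizer-type Llama2Tokenizer`, `--tokenizer-model ALCF/tokenizer.model`).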
+ +## ✅ TODOs + +
+
TODOs:

- [ ] Ensure / double-check that optimizer settings from `ds_config.json` aren't being overwritten by defaults in `megatron/arguments.py`
  - [ ] specifically: `momentum`, `beta1`, `beta2`, etc.

Completed

- Continue runs on Polaris at:
  - [x] 48 Nodes
  - [x] 32 Nodes
  - [x] 16 Nodes
  - [x] 8 Nodes
  - [x] 4 Nodes

- [x] Try re-creating ( / fixing) the conda environment with `cuda==12.1`
  - 😔, failed.

- ~~‼️ Unable to save checkpoints with `torch==2.1` + `cuda==11.8`~~:
  - Fixed in [a57a21f](https://github.com/argonne-lcf/Megatron-DeepSpeed/commit/a57a21f6b2a8abf847f5ef599e1b1edcb5a5e1b5)

🐛 Bug + + - Training progresses OK: + + ```bash + [2024-03-07 15:27:02,646] [INFO] [timer.py:260:stop] epoch=0/micro_step=199/global_step=199, RunningAvgSamplesPerSec=58.730622229657506, CurrSamplesPerSec=61.35304005128382, MemAllocated=6.01GB, MaxMemAllocated=19.52GB + iteration 199/ 317892 | consumed samples: 152832 | consumed tokens: 625999872 | elapsed time per iteration (ms): 14287.5 | learning rate: 2.407E-04 | global batch size: 768 | lm loss: 5.905366E+00 | loss scale: 8192.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 53.753 | tokens per gpu per second (tgs): 1146.733 | TFLOPs: 69.85 | + [2024-03-07 15:27:15,063] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=4, lr=[0.000240653265864008, 0.000240653265864008], mom=[(0.9, 0.999), (0.9, 0.999)] + [2024-03-07 15:27:17,188] [INFO] [timer.py:260:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=58.730745476291396, CurrSamplesPerSec=58.75503515561452, MemAllocated=6.01GB, MaxMemAllocated=19.52GB + iteration 200/ 317892 | consumed samples: 153600 | consumed tokens: 629145600 | elapsed time per iteration (ms): 14541.4 | learning rate: 2.407E-04 | global batch size: 768 | lm loss: 5.897035E+00 | loss scale: 8192.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 52.815 | tokens per gpu per second (tgs): 1126.713 | TFLOPs: 68.63 | + saving checkpoint at iteration 200 to checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb768_pp1_tp2_fp16 + # ... + ``` + + - Then crashes with: + + ```python + Traceback (most recent call last): + Traceback (most recent call last): + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/pretrain_gpt_alcf.py", line 575, in + model = main() + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/pretrain_gpt_alcf.py", line 554, in main + model = pretrain( + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 226, in pretrain + iteration = train(forward_step_func, + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 1290, in train + save_checkpoint_and_time(iteration, model, optimizer, + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 1151, in save_checkpoint_and_time + save_checkpoint(iteration, model, optimizer, opt_param_scheduler) + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/checkpointing.py", line 259, in save_checkpoint + state_dict[UNIVERSAL_CHECKPOINT_INFO] = _universal_checkpoint_info(model) + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/checkpointing.py", line 783, in _universal_checkpoint_info + info.update(model[0].universal_checkpoint_info()) + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/model/gpt_model.py", line 203, in universal_checkpoint_info + info[TP_REPLICATED_PARAMETER_PATTERNS] = self._get_tp_replicated_param_patterns() + File "/lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1695, in __getattr__ + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") + AttributeError: 'GPTModel' object has no attribute '_get_tp_replicated_param_patterns' + ``` + + 🤔 +
+ +
+ +
+ +
+ + diff --git a/ALCF/aws_ofi_nccl_plugin.sh b/ALCF/aws_ofi_nccl_plugin.sh new file mode 100644 index 0000000000..ffd1471cd3 --- /dev/null +++ b/ALCF/aws_ofi_nccl_plugin.sh @@ -0,0 +1,20 @@ +#!/bin/bash --login + +# AWS NCCL OFI Plugin settings below +export NCCL_CROSS_NIC=1 +export NCCL_COLLNET_ENABLE=1 +export NCCL_NET="AWS Libfabric" +export LD_LIBRARY_PATH=/soft/libraries/aws-ofi-nccl/v1.9.1-aws/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/soft/libraries/hwloc/lib/:$LD_LIBRARY_PATH +export FI_CXI_DISABLE_HOST_REGISTER=1 +export FI_MR_CACHE_MONITOR=userfaultfd +export FI_CXI_DEFAULT_CQ_SIZE=131072 +######################################################### +# WARNING: !!! +# - Currently, `export NCCL_NET_GDR_LEVEL=PHB` +# causes a hang on Polaris. +# so, we don't set it for the time being [2024-05-14]. +# - Seems to work on Perlmutter ??? +# +# export NCCL_NET_GDR_LEVEL=PHB +######################################################### diff --git a/ALCF/data-lists/aurora/algebraic.txt b/ALCF/data-lists/aurora/algebraic.txt new file mode 100644 index 0000000000..d3685cb42b --- /dev/null +++ b/ALCF/data-lists/aurora/algebraic.txt @@ -0,0 +1,16 @@ +0.0018520780893211373 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document algebraic-stack-train +0.0017591050606817512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document algebraic-stack-train +0.001459052794333798 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document algebraic-stack-train +0.0007405667281569194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document algebraic-stack-train +0.00019420030110896795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document algebraic-stack-train +0.0009008668715801845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document algebraic-stack-train +0.00015115827957143057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document algebraic-stack-train +0.0014552844319220648 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document algebraic-stack-train +0.0012469861325685161 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document algebraic-stack-train +0.00136412011372413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document algebraic-stack-train +0.0007064279699221103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document algebraic-stack-train +0.0008472240000687427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document algebraic-stack-train +0.0001984375713341955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document algebraic-stack-train +0.0005472773881697123 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document algebraic-stack-train +0.001815779629850992 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document algebraic-stack-train +0.0018313600689757324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document algebraic-stack-train diff --git a/ALCF/data-lists/aurora/arxiv.txt b/ALCF/data-lists/aurora/arxiv.txt new file mode 100644 index 0000000000..c18c2befd2 --- /dev/null +++ b/ALCF/data-lists/aurora/arxiv.txt @@ -0,0 +1,100 @@ +0.0002583902668716813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document arxiv +0.0002646575141232155 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document arxiv +0.0003165521247456758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document arxiv +0.0002920706460176214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document arxiv +0.00028396813182810215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document arxiv +0.00030445161883108107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document arxiv +0.00031628781276576474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document arxiv +0.0003083776568189157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document arxiv +0.0003176359471472902 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document arxiv +0.0002536009369131698 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document arxiv +0.0003067491424681363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document arxiv +0.0002597217257557784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document arxiv +0.0003788556450109768 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document arxiv +0.0002796563272052598 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document arxiv +0.00033573826524290287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document arxiv +0.00030523658022800287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document arxiv +0.00032211552192240096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document arxiv +0.0003329295675164247 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document arxiv +0.0003101982186639862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document arxiv +0.00032361798234223355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document arxiv +0.0003495541581652915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document arxiv +0.0002821637448858042 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document arxiv +0.00030399523537629673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document arxiv +0.0002955658968247219 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document arxiv +0.00028942158502924254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document arxiv +0.00028769546171490733 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document arxiv +0.0002938111057234182 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document arxiv +0.0002711150403010948 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document arxiv +0.00031130095874747565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document arxiv +0.0003002996118160777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document arxiv +0.0003732757901604459 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document arxiv +0.00026784205751795894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document arxiv +0.0002799626521661984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document arxiv +0.00034334276069078164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document arxiv +0.0003582469803674965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document arxiv +0.00031094844818418623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document arxiv +0.0002766228384977191 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document arxiv +0.00030297116159471485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document arxiv +0.00027033888377464685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document arxiv +0.00030090862368377933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document arxiv +0.00028543875802490955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document arxiv +0.00027559768459074204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document arxiv +0.0003182185533962886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document arxiv +0.0003311392971435837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document arxiv +0.00028751652060804325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document arxiv +0.000303466863212589 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document arxiv +0.00033400462801277524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document arxiv +0.0002589234031777426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document arxiv +0.0002913508598466723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document arxiv +0.0002670572450004856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document arxiv +0.00032027399105647656 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document arxiv +0.00032188376258379377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document arxiv +0.0003161585784100882 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document arxiv +0.0003184249182974135 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document arxiv +0.00030381336664000807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document arxiv +0.0003190437442184283 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document arxiv +0.0002537961798200545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document arxiv +0.0003017817117223326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document arxiv +0.00028685268513240224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document arxiv +0.00031265179094451165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document arxiv +0.00034708319096986816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document arxiv +0.00026650837943080664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document arxiv +0.00034588832248507335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document arxiv +0.0002416982248399037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document arxiv +0.0003089296918222243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document arxiv +0.00029137184185700827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document arxiv +0.00026464226846800774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document arxiv +0.00030545397919456627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document arxiv +0.0003206778460448875 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document arxiv +0.00030968971641110967 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document arxiv +0.00023325653928600864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document arxiv +0.00030526899198338555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document arxiv +0.00035376719076633584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document arxiv +0.000290224385981026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document arxiv +0.000294650083382008 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document arxiv +0.00028768858128616436 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document arxiv +0.00030856965235527843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document arxiv +0.00030579942447879054 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document arxiv +0.0002863101084704357 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document arxiv +0.0002870032092492213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document arxiv +0.000264182727569885 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document arxiv +0.0002974012367036449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document arxiv +0.00032238412143059203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document arxiv +0.00031683716893819036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document arxiv +0.00031157434937617524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document arxiv +0.0003411742735695989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document arxiv +0.00026778444816570715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document arxiv +0.0003037045797275201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document arxiv +0.00027746114370081314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document arxiv +0.00027148285946862043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document arxiv +0.00028042950114678207 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document arxiv +0.0003235607816590721 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document arxiv +0.0003086692227306295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document arxiv +0.00033990349455148105 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document arxiv +0.00030945053208470265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document arxiv +0.00027309074552265303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document arxiv +0.00028737393506316194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document arxiv +0.0003098868328009879 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document arxiv +0.0002614229162588409 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document arxiv +0.0002884388407820923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document arxiv diff --git a/ALCF/data-lists/aurora/books.txt b/ALCF/data-lists/aurora/books.txt new file mode 100644 index 0000000000..6f37023596 --- /dev/null +++ b/ALCF/data-lists/aurora/books.txt @@ -0,0 +1,3 @@ +0.0031025147279277244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document books +0.003102019887362634 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document books +0.0009996745994661548 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document books diff --git a/ALCF/data-lists/aurora/c4.txt b/ALCF/data-lists/aurora/c4.txt new file mode 100644 index 0000000000..7ad92c6086 --- /dev/null +++ b/ALCF/data-lists/aurora/c4.txt @@ -0,0 +1,171 @@ +0.0002406272620255565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document c4 +0.0002404825539493424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document c4 +0.00024062296575435581 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document c4 +0.00024069315766818953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document c4 +0.00024055829162263452 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document c4 +0.00024062053397343032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document c4 +0.0002410715545206964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document c4 +0.00024024881846087368 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document c4 +0.0002407074700790688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document c4 +0.00024072141428809043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document c4 +0.00024027710230872736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document c4 +0.0002409111299205489 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document c4 +0.00024081954058275009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document c4 +0.00024086076794990912 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document c4 +0.00024098672620832446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document c4 +0.00024068622303333862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document c4 +0.00024140627024291824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document c4 +0.0002414512033594384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document c4 +0.00024028742594941463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document c4 +0.00024018036089269645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document c4 +0.0002398347365034979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document c4 +0.00024006780153485276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document c4 +0.00024015620270419213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document c4 +0.0002408848259695227 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document c4 +0.0002408023185278831 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document c4 +0.00024021196580140326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document c4 
+0.00024077677271297493 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document c4 +0.00024087392454668027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document c4 +0.0002408071293824126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document c4 +0.00024042223828845715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document c4 +0.0002411484752360495 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document c4 +0.00023605263746465907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document c4 +0.00023471222158326908 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document c4 +0.00023432138580287644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document c4 +0.00023407385623382327 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document c4 +0.00023487504174367091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document c4 +0.0002341843704976313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document c4 +0.00023421993170282486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document c4 +0.00023445057969132037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document c4 +0.0002337681680073047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document c4 +0.000234627964808109 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document c4 +0.0002338942211888584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document c4 +0.00023403849286843386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document c4 +0.00023405641310796305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document c4 +0.00023349169562397965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document c4 +0.00023381157386048856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document c4 +0.00023388742993790587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document c4 +0.00023363103829469813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document c4 +0.00023421141834630477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document c4 +0.00023420564352232565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document c4 +0.00023367463699173143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document c4 +0.00023344969163567033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document c4 +0.00023372196941547188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document c4 +0.00023399207645297834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document c4 +0.00023357915605505856 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document c4 +0.00023337585642190864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document c4 +0.00023385005470157914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document c4 +0.00023301533534493465 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document c4 +0.00023377864302541782 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document c4 +0.00023323745848621437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document c4 +0.0002330594611151835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document c4 +0.0002334149675026783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document c4 +0.00023198945902291534 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document c4 +0.00023023784834634142 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document c4 +0.00022985623060187217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document c4 +0.0002292605284569516 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document c4 +0.00022926593333048894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document c4 +0.00022922766406807777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document c4 +0.00022898153911167426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document c4 +0.0002292473111593315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document c4 +0.000228804579400424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document c4 +0.00022865485613513526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document c4 +0.00022937426835887895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document c4 +0.00022917388311587372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document c4 +0.0002291660582019043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document c4 +0.00022907895248360543 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document c4 +0.0002294617879920205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document c4 +0.0002290452150516566 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document c4 +0.00022943405619715553 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document c4 +0.0002296271421006204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document c4 +0.00022854791372910372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document c4 +0.00022923123467686557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document c4 +0.00022852404355738494 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document c4 +0.00022847798660086642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document c4 +0.0002289604586810316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document c4 +0.00022835479834950643 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document c4 +0.0002289149402884243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document c4 +0.00022806655474763446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document c4 +0.00022826296420992974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document c4 +0.00022906829636213627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document c4 +0.0002287628414466998 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document c4 +0.0002282673911253445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document c4 +0.00022869309841939134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document c4 +0.0002281540116815451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document c4 +0.0002259755756162738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document c4 +0.00022562331285233504 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document c4 +0.0002259061146106053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document c4 +0.00022567670836663787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document c4 +0.00022573165387587061 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document c4 +0.00022508514961670572 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document c4 +0.00022564642513773356 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document c4 +0.00022563088621998788 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document c4 +0.0002250438755373707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document c4 +0.00022524465346241134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document c4 +0.00022531737657666812 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document c4 +0.00022444687519363458 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document c4 +0.00022460397498596298 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document c4 +0.00022454218976501763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document c4 +0.00022447528843671366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document c4 +0.00022501666332178926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document c4 +0.00022453752304377972 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document c4 +0.00022484451871163002 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document c4 +0.00022465678847154914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document c4 +0.00022453180917044732 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document c4 +0.0002247278486823009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document c4 +0.00022465794828242097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document c4 +0.00022431000701925386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document c4 +0.00022476020248460963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document c4 +0.00022467531771795015 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document c4 +0.0002236391309945234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document c4 +0.00022458764920536007 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document c4 +0.00022430877426744415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document c4 +0.0002247047786127192 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document c4 +0.0002245298090400035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document c4 +0.0002245648831396188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document c4 +0.00022292894729820784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document c4 +0.00022236668082957533 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document c4 +0.0002217622659895442 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document c4 +0.00022252452726732609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document c4 +0.00022135333211363678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document c4 +0.0002214571757787971 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document c4 +0.0002217188139237798 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document c4 +0.00022144214894640303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document c4 +0.00022100172806631854 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document c4 +0.00022156392409199052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document c4 +0.00022134830143710272 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document c4 +0.00022158598922529453 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document c4 +0.00022142932483041377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document c4 +0.00022120980907786554 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document c4 +0.00022117917738112441 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document c4 +0.00022077089397851235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document c4 +0.00022093265074996711 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document c4 +0.00022091299741377004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document c4 +0.0002205849150703338 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document c4 +0.0002210648204787979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document c4 +0.0002214235747364102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document c4 +0.00022083907302221787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document c4 +0.0002206334237915964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document c4 +0.00022065193929912214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document c4 +0.00022079775597767288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document c4 +0.00022091492909963518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document c4 +0.00022095009987097293 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document c4 +0.0002208150577180165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document c4 +0.00022085759102772088 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document c4 +0.00022073789170129016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document c4 +0.00022049322781182384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document c4 +0.00022083270617761285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document c4 +0.00021982452827473632 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document c4 +0.00021899870446514259 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document c4 +0.00021890358773356361 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document c4 +0.00021875556609042841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document c4 +0.00021861195987201226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document c4 +0.00021856782186167455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document c4 +0.00021912837771543515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document c4 +0.00021900213768517756 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document c4 +0.00021871675851390374 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document c4 +0.0002180537056545586 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document c4 +0.0002188196714327129 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document c4 +0.00021851362624523464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document c4 +0.0002183236795498736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document c4 +7.291153618675672e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document c4 diff --git a/ALCF/data-lists/aurora/cc.txt b/ALCF/data-lists/aurora/cc.txt new file mode 100644 index 0000000000..174bae9d6a --- /dev/null +++ b/ALCF/data-lists/aurora/cc.txt @@ -0,0 +1,1108 @@ +0.0003742481815405742 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document cc +0.00038204855962733055 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document cc +0.00038821818392663593 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document cc +0.00038723332988783727 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document cc +0.00038916141142149904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document cc +0.00038049542523949033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document cc +0.0003854755539534284 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document cc +0.00024202756466512517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document cc +0.0003915405155008087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document cc +0.0003927382151931033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document cc +0.0003839151202260479 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document cc +0.00040006817468967907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document cc +0.00040318965964443476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document cc +0.0003831013019452741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document cc +0.00039166638383204036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document cc +0.00039962784023961004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document cc +0.00039536707853602614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document cc +0.0004204304698247758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document cc +0.00041538899178693555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document cc +0.00039186953333675306 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document cc +0.00038945837196504305 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document cc +0.0003919951238929062 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document cc +0.00044377065718528966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document cc +0.0004407759068603017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document cc +0.0002487811895843715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document cc +0.00039349432045556636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document cc +0.00041223198559462343 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document cc +0.0004036573014830213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document cc +0.0003825982215521807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document cc +0.00040386867133151386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document cc +0.00024460575279105167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document cc +0.000269029789531335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document cc +0.0003573757493252864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document cc +0.0004600876681392076 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document cc +0.0002605354166397086 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document cc +0.0003882502452157999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document cc +0.0002466747612126512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document cc +0.0004024726105072402 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document cc +0.00040820631128483644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document cc +0.0002691094350403538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document cc +0.00026916830387277267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document cc +0.0004204663297880574 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document cc +0.00042379698687085554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document cc +0.0004502169227311871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document cc +0.0002661708937015295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document cc +0.00031239486948031334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document cc +0.0003109054589936201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document cc 
+0.00045873053079760646 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document cc +0.00022904931423244635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document cc +0.0003813462028433663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document cc +0.00039188129256500874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document cc +0.00045124222276983765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document cc +0.00048138658436853695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document cc +0.0003944178776279866 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document cc +0.00039941569676754006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document cc +0.00037952761190240494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document cc +0.0003944870860881476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document cc +0.0003891842411856621 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document cc +0.000387688981934861 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document cc +0.00039197953876258005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document cc +0.00039007915280311206 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document cc +0.0003995520363699188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document cc +0.00039230985654592406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document cc +0.0003929472067173851 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document cc +0.0003924096172671473 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document cc +0.0003881636143629905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document cc +0.000389790617937084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document cc +0.00037351762309221023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document cc +0.0003630196170929407 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document cc +0.00033532465765142113 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document cc +0.0003076088685761823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document cc +0.00039463850897720803 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document cc +0.0002843816115231449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document cc +0.0002909175709416474 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document cc +0.00028867170997202486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document cc +0.0002838644617723659 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document cc +0.00029027869525543416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document cc +0.0002821339567560056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document cc +0.0002922988877045601 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document cc +0.0002866955958315786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document cc +0.0002865271754558126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document cc +0.0002861247475618473 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document cc +0.0002826681072408606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document cc +0.0002849746458282827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document cc +0.0002816966633435316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document cc +0.00026255342235948463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document cc +0.0002552895098829678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document cc +0.00025990194083107813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document cc +0.0002524062657685835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document cc +0.0002538577379748611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document cc +0.0002561415177406761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document cc +0.00026206253059694905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document cc +0.00026168095406910565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document cc +0.0002601305742008613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document cc +0.00025200823006814814 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document cc +0.0003229951981263502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document cc +0.00037289448266476045 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document cc +0.0003807825862179898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document cc +0.0003616333738191483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document cc +0.0003665117918907636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document cc 
+0.0003684186453633228 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document cc +0.0003589330610806066 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document cc +0.00036383861418030395 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document cc +0.000359841363355303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document cc +0.00036431044063050464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document cc +0.0003668574090358279 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document cc +0.000362768263620199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document cc +0.0003501888032771077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document cc +0.000352401968221528 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document cc +0.0003541019701869794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document cc +0.0003628121865546891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document cc +0.0003752582953758773 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document cc +0.00037902046230424966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document cc +0.0003777927146925147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document cc +0.0003760676130509053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document cc +0.00034046049078755405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document cc +0.0003338847563259091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document cc +0.00033294499102761794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document cc +0.0004912026198265864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document cc +0.00032064363474664014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document cc +0.00032154190389541214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document cc +0.00032309660151746207 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document cc +0.00031181143365304544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document cc +0.00031046092294569104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document cc +0.00031150165249068046 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document cc +0.0003041314265988224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document cc +0.0003024834909739394 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document cc +0.0003019936835833604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document cc +0.000292329665283177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document cc +0.0002867061143144972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document cc +0.00028443615610701707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document cc +0.00028462291013755945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document cc +0.0002793538601205013 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document cc +0.00027306573977044246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document cc +0.00027097155673336525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document cc +0.0002752934202112985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document cc +0.00043042012694697647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document cc +0.00047495648822986177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document cc +0.00047755032493473855 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document cc +0.0004706974343933747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document cc +0.00046682163297771817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document cc +0.0004616765425874178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document cc +0.00030644496751628097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document cc +0.0002909492555358308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document cc +0.00027272036068261724 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document cc +0.0004101070217315588 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document cc +0.0003728914338834357 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document cc +0.00036546911442305647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document cc +0.0003669945482407483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document cc +0.0003715902407424017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document cc +0.00035837486406683366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document cc +0.0003573318538685469 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document cc +0.0003553784893071916 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document cc 
+0.0004920659809912352 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document cc +0.0004533619411303183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document cc +0.00045067066057818706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document cc +0.00044396985139270645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document cc +0.00043198288204468477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document cc +0.00043005174223738454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document cc +0.00041847118430776784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document cc +0.00042952036375796664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document cc +0.00043420594647324267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document cc +0.0003461123241053012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document cc +0.0003408581597849182 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document cc +0.00033172705422182547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document cc +0.0003392566490686136 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document cc +0.00033578341518385483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document cc +0.0003439196710518844 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document cc +0.00034559163447085543 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document cc +0.00033762478642902825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document cc +0.00033215210055107224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document cc +0.00033423579608014966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document cc +0.0004963355016025102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document cc +0.0004996862761456923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document cc +0.0005000551829325451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document cc +0.0005004212610098755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document cc +0.00027768695585500585 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document cc +0.00028395983854338433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document cc +0.00027835826303062254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document cc +0.0002740073176010804 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document cc +0.0002791830529274016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document cc +0.0002796863816194411 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document cc +0.00026697453022672804 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document cc +0.0002594197440280141 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document cc +0.0003779565697649222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document cc +0.00041835823476586606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document cc +0.00043788493575265915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document cc +0.0002731731970096006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document cc +0.000276305847423402 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document cc +0.0002704955773958623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document cc +0.0002629635944827518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document cc +0.000260070956974436 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document cc +0.00025661553791456334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document cc +0.00025794727207576157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document cc +0.00025295733980001527 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document cc +0.0003788106407021029 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document cc +0.0004882344027669431 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document cc +0.0003275324309642705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document cc +0.0004803401856640094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document cc +0.00046720138323433943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document cc +0.00043527810307095335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document cc +0.00043905395741627827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document cc +0.00048774175867331425 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document cc +0.00048380704121346737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document cc +0.0004779011848346118 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document cc +0.00046255587581908036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document cc 
+0.00045127922880511576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document cc +0.0004503891485256095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document cc +0.0004450142332303422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document cc +0.00044630282482516654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document cc +0.00044325014465743616 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document cc +0.0004263874842796447 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document cc +0.0004217530913646938 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document cc +0.000415120314341852 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document cc +0.00040987168279144537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document cc +0.00033468337266607834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document cc +0.0003353094464683005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document cc +0.0004833936821707294 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document cc +0.00047194878988920935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document cc +0.0004648324126996427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document cc +0.0004562345003964941 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document cc +0.0004933203505465098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document cc +0.0003530166075325466 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document cc +0.00035368548192804685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document cc +0.0004872620828289663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document cc +0.00048293889392426456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document cc +0.00047936768462267655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document cc +0.00047821013991587545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document cc +0.0004660610308564753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document cc +0.000394683430103437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document cc +0.00039165053441571324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document cc +0.0003906936040164381 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document cc +0.00038074803919159006 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document cc +0.0003686529291578143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document cc +0.00035832920428870976 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document cc +0.00035929024535947033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document cc +0.0003538226556050544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document cc +0.0003584167868708799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document cc +0.0003480507542594234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document cc +0.0003413709023543034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document cc +0.00034001304759361455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document cc +0.00033430532902756514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document cc +0.00046519252660631277 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document cc +0.0002938876402514769 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document cc +0.00028676090994509047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document cc +0.00027296150117506716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document cc +0.00026513502621960483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document cc +0.0002680081327926125 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document cc +0.00025831225828720344 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document cc +0.00026647037295561 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document cc +0.0002525733734572654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document cc +0.00025831708887575375 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document cc +0.00042487627444443476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document cc +0.0004951213245023891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document cc +0.0004804051413177752 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document cc +0.0004662397611340532 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document cc +0.0004550138655253933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document cc +0.00044494909122746795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document cc +0.0002899112253051385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document cc 
+0.0004372879736279761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document cc +0.0004529568099252922 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document cc +0.00045127826158829573 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document cc +0.0004436558176737439 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document cc +0.0004419233237678378 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document cc +0.000434589215880319 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document cc +0.00029153613207706566 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document cc +0.0004312458058738854 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document cc +0.00028741854968757313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document cc +0.00046853200754421234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document cc +0.0004949145252030074 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document cc +0.00044459683920483167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document cc +0.0003836095306696336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document cc +0.0003789760237872398 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document cc +0.0003749227438304427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document cc +0.0003628558277173369 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document cc +0.00039468301394041474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document cc +0.00038874701821614864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document cc +0.0004158492456077867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document cc +0.00042360504554060077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document cc +0.00040386729844317623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document cc +0.00027595096702902474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document cc +0.00043638766787829135 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document cc +0.0002218691596850179 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document cc +0.0004437566108089954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document cc +0.0003889996411609667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document cc +0.00043454421906537704 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document cc +0.0004522564392830988 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document cc +0.00041517835659357416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document cc +0.0002614360863446896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document cc +0.00037543522111463596 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document cc +0.0004386190133514781 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document cc +0.00046358333286115075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document cc +0.00043186261317942404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document cc +0.0002377581602097957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document cc +0.00025973334085074254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document cc +0.00040139099332000796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document cc +0.00043674860686687174 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document cc +0.00040853289309329373 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document cc +0.000242910191729688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document cc +0.0004431071731750582 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document cc +0.0004388092670482523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document cc +0.000381418866255965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document cc +0.0004100117296419717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document cc +0.00042469230366022745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document cc +0.00041744151905374254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document cc +0.00022835699906752945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document cc +0.0004380161085387397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document cc +0.00044803212381807456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document cc +0.00040554932796137236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document cc +0.0004234508646347761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document cc +0.00043341209652360653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document cc +0.00023966604734537185 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document cc +0.000259165907316014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document cc +0.0004270653021833602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document cc +0.0004341547032162028 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document cc +0.0004111478117275994 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document cc +0.0004299383567984396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document cc +0.0004241899124590779 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document cc +0.0004502719349364145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document cc +0.00038994621469645615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document cc +0.0003859912398894952 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document cc +0.0004247535950310557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document cc +0.000386982084327716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document cc +0.0004196451040053251 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document cc +0.0004096278509782259 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document cc +0.0004373334932695721 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document cc +0.0004180889975240641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document cc +0.00042079636929672745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document cc +0.00038063574611812913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document cc +0.0003817505891515542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document cc +0.0004420096268860222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document cc +0.00039182670726410623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document cc +0.0003635667850372299 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document cc +0.00041564996472055667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document cc +0.000400529358757286 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document cc +0.0003939113874958451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document cc +0.00039066622068940996 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document cc +0.0004290098538807143 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document cc +0.0004240739958197099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document cc +0.00040775392659215333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document cc +0.0004091634200396925 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document cc +0.00042299190476617914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document cc +0.0003701492680344151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document cc +0.0003807353844384635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document cc +0.00038813507771983156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document cc +0.00040072346558408346 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document cc +0.0003603595180423597 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document cc +0.00038799421353112465 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document cc +0.00037575235582264926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document cc +0.0004239190342959713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document cc +0.0004606044799136546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document cc +0.00045107950652529253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document cc +0.0004391947201871058 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document cc +0.0004457516661123035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document cc +0.0004301297170991686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document cc +0.00044661704164586694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document cc +0.0004438849846114837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document cc +0.0004444205734316823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document cc +0.0004190924165303394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document cc +0.00043942581131677875 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document cc +0.00021568459798090663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document cc +0.0003814929225407199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document cc +0.0003217453179359235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document cc +0.00031719591470267974 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document cc +0.00032434115726922137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document cc +0.0004079911120371051 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document cc +0.000329492766381148 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document cc +0.0003845916162001633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document cc +0.0003835208964390098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document cc +0.00037847334157173194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document cc +0.00038296039903791865 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document cc +0.00037896336828472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document cc +0.00037620974396391355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document cc +0.00037420590727111843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document cc +0.000340490625886403 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document cc +0.0003078314411035827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document cc +0.00034153990750656097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document cc +0.0003308858103982067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document cc +0.0003452640607156025 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document cc +0.00033095276418403455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document cc +0.0003116308995860414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document cc +0.00032446713226408477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document cc +0.0003015816821912984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document cc +0.00031612418775706894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document cc +0.0003278516344971041 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document cc +0.00033079446736097217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document cc +0.00032278977146550837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document cc +0.00032065272988207914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document cc +0.0003936696452406576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document cc +0.0003450109536627789 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document cc +0.0003339787189919641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document cc +0.0003284303856176974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document cc +0.00033652677276843477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document cc +0.0003257822443845694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document cc +0.0003293985569149334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document cc +0.0003310360260148262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document cc +0.0003233770986418526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document cc +0.0003172280092149422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document cc +0.0003160674744292835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document cc +0.00030931090289598506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document cc +0.0003093173886443107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document cc +0.00033167847081104083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document cc +0.00031131501311729723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document cc +0.00031046608876279845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document cc +0.00030569235942207244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document cc +0.00030777943671285197 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document cc +0.00029303314290956683 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document cc +0.0003045824546400205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document cc +0.00030360880677729793 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document cc +0.00031646239964835433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document cc +0.0003129122300603785 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document cc +0.00031060464956661433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document cc +0.000311819032500067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document cc +0.0002977872483902282 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document cc +0.0003009448600922438 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document cc +0.00028610292098537774 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document cc +0.0002988326876216654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document cc +0.00028550828372819075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document cc +0.0002830381750875739 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document cc +0.0002848495855927156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document cc +0.0002856443760308144 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document cc +0.00027442895344188584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document cc +0.0002681160554049462 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document cc +0.0003421482544126989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document cc +0.0004005872948449718 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document cc +0.0003930123959320308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document cc +0.0003867271832275778 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document cc +0.000380805140455254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document cc +0.0003814769861947819 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document cc +0.00038025170883282324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document cc +0.0003738026647867475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document cc +0.00018960856915036276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document cc +0.0003697177501953134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document cc +0.00036674194328136693 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document cc +0.00036447406838697555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document cc +0.00036686410861101255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document cc +0.00035915267825103423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document cc +0.0003624758404026675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document cc +0.0002822812140180794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document cc +0.00030620512946920813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document cc +0.000294249776520589 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document cc +0.00030238536967523434 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document cc +0.00029509593361580754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document cc +0.0002906912701830899 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document cc +0.0002921944165474959 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document cc +0.00028358919691127954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document cc +0.0002813182772323272 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document cc +0.00027442640800299205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document cc +0.0002747820342933984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document cc +0.0002747584403979717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document cc +0.00027499129634862444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document cc +0.0002712050404257197 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document cc +0.0002616256943143254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document cc +0.00026769938929002815 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document cc +0.00038396081322727017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document cc +0.0003863140490027991 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document cc +0.00037702277513203237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document cc +0.0003633274156107032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document cc +0.0003587473889240435 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document cc +0.0003507672084278415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document cc +0.00033776425499780385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document cc +0.0003377914127574796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document cc +0.00032948015659161326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document cc +0.00033245638541392985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document cc +0.00031080707640648695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document cc +0.0002976903331149755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document cc +0.0002965121463725523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document cc +0.0002933849695266647 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document cc +0.0002837035078508233 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document cc +0.00028684569079589323 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document cc +0.0003145192320802359 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document cc +0.0003566937253273515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document cc +0.0003470199109592918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document cc +0.0003060245312041868 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document cc +0.0002650817213818789 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document cc +0.0002643604938780134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document cc +0.000299350876031416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document cc +0.0003178540797697938 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document cc +0.000271850367887767 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document cc +0.00031349896596549 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document cc +0.00031749734412765755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document cc +0.0003791137842391209 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document cc +0.0003742334169957992 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document cc +0.0003705639757351107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document cc +0.0003126986769797042 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document cc +0.00031038132814561196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document cc +0.00036464437173804883 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document cc +0.0003569480488951322 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document cc +0.0003541239221619106 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document cc +0.00035315297411308053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document cc +0.0003572451925404141 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document cc +0.0003514986129411253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document cc +0.0003521798298425866 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document cc +0.00034553677439244716 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document cc +0.000349004719809412 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document cc +0.0003468247484872769 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document cc +0.0003465822608356558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document cc +0.00035410983132162007 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document cc +0.0003487908354969444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document cc +0.0003479024763238147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document cc +0.000341412530646823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document cc +0.00034451316273667034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document cc +0.0002618849993484869 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document cc +0.00026788679978901144 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document cc +0.00027450670773227214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document cc +0.0002661273129899329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document cc +0.00026836569676402957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document cc +0.00026155876975483236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document cc +0.0002609276830117151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document cc +0.0002644161630512771 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document cc +0.00036789208972872557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document cc +0.00037829849439990513 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document cc +0.0003788894943523098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document cc +0.0003617207777959397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document cc +0.0002541334487248998 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document cc +0.0002707945538071073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document cc +0.00027046282716455214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document cc +0.0002652443167243215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document cc +0.0002685859923850986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document cc +0.00025734961751176414 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document cc +0.000259041720872915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document cc +0.00025340107274823446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document cc +0.00025757135121837893 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document cc +0.00025617700500574084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document cc +0.0002566931670562857 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document cc +0.0002543871190716101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document cc +0.00024997565589481713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document cc +0.0002954079779456287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document cc +0.00034890741135252835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document cc +0.0003473298137731525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document cc +0.0003296959618486435 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document cc +0.0003304520061604598 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document cc +0.00032377956175729824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document cc +0.00031700696295168713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document cc +0.0003060382346081943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document cc +0.0003012003005056863 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document cc +0.0002981074073993884 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document cc +0.0002922128825950705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document cc +0.000348901087722931 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document cc +0.0003408286289467841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document cc +0.0003410649680770183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document cc +0.0003358524215576502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document cc +0.0003343661874989231 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document cc +0.00032810573699389156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document cc +0.00032261449539097497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document cc +0.0003162694866049203 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document cc +0.0003158381156468853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document cc +0.000317376061083603 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document cc +0.0003125788639953052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document cc +0.0003010105041885602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document cc +0.0003065865059090678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document cc +0.0003084275726508053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document cc +0.00030966560718296085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document cc +0.0002957728057853081 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document cc +0.00029904164542325336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document cc +0.0002955358888729187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document cc +0.00028692976446931544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document cc +0.0002923476214935797 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document cc +0.0002893691697212419 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document cc +0.0002855895211981585 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document cc +0.00027968347097626246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document cc +0.0002810783462604979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document cc +0.00027794080455729715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document cc +0.00034784376461416953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document cc +0.0003488347959010943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document cc +0.00034790583710250724 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document cc +0.000345913166618151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document cc +0.00033801936268066675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document cc +0.0003290591130212315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document cc +0.00034051399521366823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document cc +0.00032470943131841784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document cc +0.00031679540050914276 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document cc +0.00031814596342422325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document cc +0.0003156466289485036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document cc +0.00029985010879003633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document cc +0.0002905176377776361 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document cc +0.0004206836775460856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document cc +0.00020660449162246918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document cc +0.0003461727254468087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document cc +0.00020592870907067763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document cc +0.00034173505299233005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document cc +0.0004052437256652738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document cc +0.0004080650901351697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document cc +0.00039778184149144276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document cc +0.00039046311464950275 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document cc +0.00039043444911071384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document cc +0.000388575704932843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document cc +0.00019737533145666597 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document cc +0.00037610755595812403 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document cc +0.00037315400127598317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document cc +0.00037415028580922163 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document cc +0.00036694041707212337 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document cc +0.00018947219857306515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document cc +0.00037046050826533545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document cc +0.0003587440768559087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document cc +0.00034623936498708903 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document cc +0.0003502289592617922 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document cc +0.00034692398063649823 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document cc +0.000339340809421849 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document cc +0.0003360510394816983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document cc +0.0003354673850814145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document cc +0.00032937682875877047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document cc +0.00032844505049317715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document cc +0.00028287199339908627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document cc +0.0002795217197003578 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document cc +0.00028048955601883463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document cc +0.0002769326396439027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document cc +0.0002727090021299243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document cc +0.0002726577841024554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document cc +0.00026663619593455374 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document cc +0.00026068042672138127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document cc +0.0002637704114326801 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document cc +0.0002593043567100412 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document cc +0.0002599897110113453 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document cc +0.0002435078682758859 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document cc +0.0002450530071379054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document cc +0.00024233331983743606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document cc +0.0002934750947999535 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document cc +0.00033241226364044474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document cc +0.00032938406090272075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document cc +0.00032778705403953246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document cc +0.00032184551480398754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document cc +0.00031874002264945737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document cc +0.0003165319685666433 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document cc +0.00031307071173376295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document cc +0.00031119524184911957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document cc +0.0003102253344576429 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document cc +0.0003088976240383192 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document cc +0.0002951410823077708 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document cc +0.00029772657676757413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document cc +0.0003056048989909935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document cc +0.00031991305381648026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document cc +0.00030890256978362426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document cc +0.0003109382904091933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document cc +0.00031035798529690644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document cc +0.00030741666395911753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document cc +0.0002989918594861846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document cc +0.00029569635443989434 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document cc +0.0002973992445667285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document cc +0.000293397351001072 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document cc +0.00028737817438047954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document cc +0.00028252738144009747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document cc +0.0002805511898623541 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document cc +0.0003718020784620472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document cc +0.0003499713845765235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document cc +0.00034283547445326676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document cc +0.00031464759888838765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document cc +0.00033188946446414833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document cc +0.000326084432195463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document cc +0.0003764568303917893 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document cc +0.0003604955598858414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document cc +0.0003655654554133222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document cc +0.00035762304033750504 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document cc +0.00038478883950347103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document cc +0.00027735714341247454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document cc +0.00028139534607773563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document cc +0.00019777292251713763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document cc +0.000285571704874486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document cc +0.00028543482146244363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document cc +0.00019434234484256758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document cc +0.00027854908176986763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document cc +0.0002847068039566143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document cc +0.00028672356943064853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document cc +0.00027782687605808177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document cc +0.0002843539634105203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document cc +0.0002894748379090401 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document cc +0.0002868852440186493 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document cc +0.0002818504885373851 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document cc +0.00028680112812941034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document cc +0.00019258978168723977 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document cc +0.00028760637934715155 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document cc +0.0002820439443912918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document cc +0.0002831001054410018 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document cc +0.00029001901552467397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document cc +0.00027779449377883156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document cc +0.00019949837437516796 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document cc +0.0002907306472984446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document cc +0.00027814858381318327 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document cc +0.00019472790889161432 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document cc +0.00020472626596924125 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document cc +0.0002870045081974301 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document cc +0.00019812241927078482 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document cc +0.0002817553333369554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document cc +0.00027829782796642117 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document cc +0.00028289431732284113 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document cc +0.0002795526296717729 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document cc +0.00027682829988044574 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document cc +0.0002895432402719184 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document cc +0.0002823174903941811 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document cc +0.00028170972351837796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document cc +0.00027807915877838826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document cc +0.00028588515681452956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document cc +0.00028112324090816726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document cc +0.00020636178289985485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document cc +0.00019447255290980535 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document cc +0.0002850824220591452 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document cc +0.00027856429520116784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document cc +0.0002820880676635633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document cc +0.00028943902215995714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document cc +0.0002676366291085329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document cc +0.00023806333809954687 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document cc +0.00024526460430233455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document 
cc +0.00023876876664622726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document cc +0.00023379770334179805 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document cc +0.00024175151269138382 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document cc +0.00023386583242595706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document cc +0.00023771797150160827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document cc +0.0002262748967483896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document cc +0.0002408148346432682 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document cc +0.00023398651720444235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document cc +0.00022989433874474592 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document cc +0.00023948500543957772 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document cc +0.0002331594076859196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document cc +0.00023375132439600242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document cc +0.00023923410909668642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document cc +0.00023952796315562954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document cc +0.0002327466076905069 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document cc +0.00023082758956797212 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document cc +0.0002240509275524448 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document cc +0.00022798879995765268 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document cc +0.000221172516774386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document cc +0.00021767045123534623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document cc +0.00021982832794804484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document cc +0.00021971626543789102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document cc +0.00022566565206920132 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document cc +0.0002181984894194856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document cc +0.00021831417549554653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document cc +0.00021601405421187145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document cc +0.00022275733725519607 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document cc +0.00021847734911973986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document cc +0.0002243591012664014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document cc +0.00021688758139483833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document cc +0.0002182953624789215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document cc +0.00020475155724026002 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document cc +0.00021498078062960065 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document cc +0.0002157914337233064 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document cc +0.00021781838494967963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document cc +0.00021723242266814558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document cc +0.0002176782686553837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document cc +0.0003486179404943968 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document cc +0.00034882846352857634 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document cc +0.00031400868448352596 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document cc +0.00030273484020011963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document cc +0.00029895889118145404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document cc +0.00029770764609621714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document cc +0.0002990181332116852 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document cc +0.00029653733972285996 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document cc +0.00029624649222942476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document cc +0.00029625609720203576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document cc +0.00029731928930852147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document cc +0.00029011721326148513 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document cc +0.00028849788197494655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document cc +0.00021601278623858145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document cc +0.00021319599281739178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document cc +0.0002153325290600083 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document cc +0.00018566946174516558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document cc +0.00020736824394291617 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document cc +0.00020857419820128004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document cc +0.00020058526129536423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document cc +0.00020745812166665217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document cc +0.00020652171015271702 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document cc +0.00020643808911278608 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document cc +0.00020040513914482103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document cc +0.00020598050188272898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document cc +0.0001969184139343296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document cc +0.0001972748812937012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document cc +0.0002038556751586195 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document cc +0.00020245186011313464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document cc +0.00019950381422038783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document cc +0.00020837055459665258 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document cc +0.00020371856218246096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document cc +0.00019537612301625791 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document cc +0.00019914984508813857 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document cc +0.0002053787713691309 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document cc +0.00019082100541008637 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document cc +0.00020397153334531813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document cc +0.0002021462693077317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document cc +0.00019609357008124035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document cc +0.00019693256622486236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document cc +0.00020007239732428112 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document cc +0.00020467075741591954 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document cc +0.00019584883400022932 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document cc +0.00019135050391176972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document cc +0.0003362829834208298 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document cc +0.00034013691154784095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document cc +0.00033215887031941976 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document cc +0.00032681189065396707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document cc +0.0003149138485493094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document cc +0.00030179177307540077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document cc +0.0002923278437581119 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document cc +0.00029470052278994486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document cc +0.0002994095093045731 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document cc +0.00029033525096085037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document cc +0.00029390798852496565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document cc +0.0002916230924130842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document cc +0.00029419886374594913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document cc +0.0002865469756730764 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document cc +0.00021191292549942086 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document cc +0.00021369664817409847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document cc +0.00021612485624266726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document cc +0.00022242192634588478 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document cc +0.00014605095659989698 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document cc +0.00022070626106341693 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document cc +0.0002174420774054071 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document cc +0.00021325858963116995 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document cc +0.0002124322999488052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document cc +0.0002081218896969054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document 
cc +0.0002108710211556957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document cc +0.00020686867095978426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document cc +0.00020895752681041895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document cc +0.00020741922266415738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document cc +0.0002069112657197308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document cc +0.00020644627473468118 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document cc +0.00020332991338121604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document cc +0.0003560895677789848 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document cc +0.00032915779111908214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document cc +0.00033810613317040864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document cc +0.00033729626594036923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document cc +0.00033550342864602944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document cc +0.00034173474024556906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document cc +0.000331505340748827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document cc +0.0003270050330117195 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document cc +0.00032585275329172556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document cc +0.0003143383203190604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document cc +0.00031655199110388894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document cc +0.00030738872158476413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document cc +0.00030838388352699285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document cc +0.0003053596995351888 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document cc +0.00031836304739584593 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document cc +0.000315315435873905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document cc +0.0003087116248965243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document cc +0.00030396790625537645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document cc +0.0003335812246032149 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document cc +0.00034570956323095843 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document cc +0.00034563035636675786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document cc +0.00033411265479076335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document cc +0.00034439191141692787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document cc +0.0003364483125496565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document cc +0.0003299500453608033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document cc +0.00033163377700074837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document cc +0.00032638649660627673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document cc +0.00032616167939645234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document cc +0.0003205289298760723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document cc +0.00031939393740815355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document cc +0.00031593164066731296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document cc +0.00031928871111254405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document cc +0.00029670189073175004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document cc +0.00020517703846735904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document cc +0.00020128418186172073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document cc +0.00019662723895606717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document cc +0.0001981157042081407 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document cc +0.00019703489037041608 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document cc +0.00019079796331785068 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document cc +0.0001909352306690079 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document cc +0.00018824662295261396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document cc +0.00019864275319325954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document cc +0.00018818516521649587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document cc +0.00018875694972812844 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document cc +0.00018231621170645482 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document cc +0.00018349407845798273 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document cc +0.00018088971427746906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document cc +0.00018296284236327237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document cc +0.0001876011825819916 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document cc +0.000329052068725176 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document cc +0.00032223616273648536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document cc +0.00031272564089633955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document cc +0.00031621609908414494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document cc +0.0003117213560911235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document cc +0.00030218064069945934 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document cc +0.00030658916600512085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document cc +0.0002915863534115821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document cc +0.0002940280138374372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document cc +0.00029067860468866085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document cc +0.00028529228063135635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document cc +0.00028336893301452256 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document cc +0.0002794668089130099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document cc +0.00021681361378827842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document cc +0.0001484664674497246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document cc +0.00021950558378215133 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document cc +0.00021806860758808645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document cc +0.00021819568718852282 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document cc +0.00021626925931585001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document cc +0.0001464536143077762 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document cc +0.00021432777088808917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document cc +0.000213473805865147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document cc +0.00021397067253964538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document 
cc +0.00020758957647437263 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document cc +0.00020687124337683314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document cc +0.00020630057046511005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document cc +0.0002091166859352538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document cc +0.00020777355025615267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document cc +0.00020709287641496176 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document cc +0.00020736464660577094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document cc +0.00020062246741862607 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document cc +0.00020693207561942915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document cc +0.00021151004871893024 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document cc +0.00019930249098689716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document cc +0.00021589710041231824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document cc +0.00021369204789905741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document cc +0.0002147099923936778 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document cc +0.00021077531190389536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document cc +0.0002100509829113836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document cc +0.00021185362601571124 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document cc +0.00020722136637339565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document cc +0.00020300093701169531 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document cc +0.00019859737993313477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document cc +0.00019971314372100164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document cc +0.00019549908270269278 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document cc +0.00019649820843534028 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document cc +0.00019619415513498067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document cc +0.00019493006120377898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document cc +0.00019499409035775506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document cc +0.00019252988593634277 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document cc +0.00019440768268686405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document cc +0.00018747161324755577 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document cc +0.0001879575932372779 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document cc +0.00019040707058357506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document cc +0.0001871931095090703 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document cc +0.00020112966223017096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document cc +0.00020516878165311017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document cc +0.00020664735191740533 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document cc +0.00021041398572882962 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document cc +0.00020397992929690396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document cc +0.0002039978580295561 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document cc +0.00020592785601142126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document cc +0.0001990755527445265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document cc +0.00019729564847798732 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document cc +0.00019958182230527032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document cc +0.0001985037302636386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document cc +0.00020204130355115716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document cc +0.0002000296401958085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document cc +0.0001983064832295463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document cc +0.00019663108484195617 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document cc +0.00019510678560556523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document cc +0.0001873284057063206 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document cc +0.00019311553072495885 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document cc +0.00034652137288816547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document cc +0.0002813690318850024 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document cc +0.00027697649713138685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document 
cc +0.0002755419092534421 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document cc +0.0002681583054440219 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document cc +0.00026945753192750824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document cc +0.00026169470768245737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document cc +0.00026437008960810825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document cc +0.0002637294838228 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document cc +0.00026491867965088836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document cc +0.00025504483625138986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document cc +0.0002545040623796586 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document cc +0.0002546682814073622 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document cc +0.00025545439487142615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document cc +0.0002626896557978271 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document cc +0.00025092040940402784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document cc +0.0002589154885863872 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document cc +0.00024106160482721467 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document cc +0.0002483289690087987 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document cc +0.0002388930282784437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document cc +0.00024006340759273874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document cc +0.00023765248178029045 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document cc +0.00023061351965578936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document cc +0.00024954224883546477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document cc +0.00017861017233018525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document cc +0.00017810832743667658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document cc +0.00017599709170759497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document cc +0.00017462723516505223 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document cc +0.0002906316527068669 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document cc +0.00033762141066247166 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document cc +0.00017170670574152494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document cc +0.00017258674515137717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document cc +0.0002815386173173926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document cc +0.0002996845935618989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document cc +0.0002735268488987296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document cc +0.0002971738713071517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document cc +0.0002942690674002763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document cc +0.0003322222207729567 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document cc +0.0003378721656198464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document cc +0.00018307262621851067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document cc +0.00033956081502775057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document cc +0.00031604820927876276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document cc +0.00028805657681088917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document cc +0.00026312293321215633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document cc +0.00034366936722921455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document cc +0.0002865256504406559 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document cc +0.0003063615195861786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document cc +0.00028412791619666136 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document cc +0.00028060835132727154 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document cc +0.00032544974761560506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document cc +0.0002647177833217225 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document cc +0.0003152621884896575 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document cc +0.0003054625140336913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document cc +0.00031183308312292263 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document cc +0.00018175026696621178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document cc +0.00017699918328872 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document cc 
+0.00018222339261441908 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document cc +0.00018348005930964137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document cc +0.0001810735993810541 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document cc +0.00030846441282038914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document cc +0.0002972326889310354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document cc +0.00017433421318235594 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document cc +0.00032799458649525895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document cc +0.00032482130048512673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document cc +0.00031943465668672475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document cc +0.00029615593630484517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document cc +0.0002893126939511001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document cc +0.0002849288351723284 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document cc +0.00028383906633569267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document cc +0.00028072526091262615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document cc +0.000284239564292377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document cc +0.0002778903109432523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document cc +0.0002771644389501471 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document cc +0.0002733316182319337 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document cc +0.00026362539185869363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document cc +0.0002636325383220217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document cc +0.00026740622442302886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document cc +0.0002646771971853427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document cc +0.0002628566720605389 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document cc +0.0002644760695434766 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document cc +0.0002623837702310999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document cc +0.00026088722976772894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document cc +0.0002567065374799158 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document cc +0.00018857382101207726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document cc +0.00019036580399817203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document cc +0.00018348828065261222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document cc +0.00018491851780345073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document cc +0.00018904887260080187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document cc +0.0001875609304251801 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document cc +0.00018393034720015817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document cc +0.00018419795526114903 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document cc +0.00018699955623404795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document cc +0.00018276256902965128 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document cc +0.00017698045695190812 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document cc +0.00018104650132303642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document cc +0.00017758206731279688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document cc +0.00017131402995103497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document cc +0.000175944428350446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document cc +0.0003416745727147391 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document cc +0.0003163259373952889 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document cc +0.0002804489269172448 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document cc +0.00028748272397403175 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document cc +0.00027603318345630605 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document cc +0.000271638824679648 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document cc +0.0002763761210210942 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document cc +0.00026501984873172717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document cc +0.00026422486894694714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document cc +0.0002686339100849262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document cc +0.0002610837453940606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document 
cc +0.000260974343729353 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document cc +0.0002599403837029134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document cc +0.0002937273113238609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document cc +0.0003341790732600504 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document cc +0.0002620661576600244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document cc +0.0003027929169239288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document cc +0.00031944039129326894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document cc +0.00019025676304139009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document cc +0.00018680910145009907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document cc +0.00034215840419416437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document cc +0.00018618120812119364 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document cc +0.00018605853095599425 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document cc +0.00018120712626096538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document cc +0.00018315079292495327 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document cc +0.00018362556449041974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document cc +0.0001780024456718171 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document cc +0.00033296526436178697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document cc +0.0001802398632282846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document cc +0.00017340263100798256 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document cc +0.00017755840547238697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document cc +0.00018419413735260606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document cc +0.00017869518174591322 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document cc +0.00017526271460129484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document cc +0.00017852168597981907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document cc +0.00017566536156787157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document cc +0.00017589867964432936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document cc +0.00017831487394075305 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document cc +0.00017837310528935862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document cc +0.00018200908814216548 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document cc +0.0001795136627511612 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document cc +0.0003414021775300033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document cc +0.00017177291787788502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document cc +0.0003441900648571877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document cc +0.0003394534597060673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document cc +0.0003236887233114832 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document cc +0.0001639544129688747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document cc +0.00019137443753211255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document cc +0.00018575146284680153 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document cc +0.00019184792863440243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document cc +0.00018966043065679055 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document cc +0.00017968851317035848 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document cc +0.00018479881897661546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document cc +0.0001813642692683015 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document cc +0.0001686449798983066 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document cc +0.00018516104592230446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document cc +0.00031283726601066385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document cc +0.0003248607542883853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document cc +0.00031583241601202365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document cc +0.00031238270857730376 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document cc +0.000307150592403979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document cc +0.00029443829986847044 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document cc +0.0002942723732234677 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document cc +0.00023514930666443422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document cc 
+0.0020776328951453444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document cc
+0.0021768234410538883 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document cc
+0.002106973549276289 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document cc
+0.002110915756171751 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document cc
+0.0017032382109816464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document cc
+0.0019047944877712286 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document cc
+0.0019402711744016077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document cc
+0.0006264790011223686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document cc
+0.0017885401938106643 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document cc
+
diff --git a/ALCF/data-lists/aurora/dolma.txt b/ALCF/data-lists/aurora/dolma.txt
new file mode 100644
index 0000000000..4aba801e00
--- /dev/null
+++ b/ALCF/data-lists/aurora/dolma.txt
@@ -0,0 +1,2419 @@
+0.0018520780893211373 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document algebraic-stack-train
+0.0017591050606817512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document algebraic-stack-train
+0.001459052794333798 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document algebraic-stack-train
+0.0007405667281569194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document algebraic-stack-train
+0.00019420030110896795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document algebraic-stack-train
+0.0009008668715801845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document algebraic-stack-train
+0.00015115827957143057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document algebraic-stack-train
+0.0014552844319220648 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document algebraic-stack-train
+0.0012469861325685161 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document algebraic-stack-train
+0.00136412011372413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document algebraic-stack-train
+0.0007064279699221103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document algebraic-stack-train
+0.0008472240000687427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document algebraic-stack-train
+0.0001984375713341955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document algebraic-stack-train
+0.0005472773881697123
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document algebraic-stack-train +0.001815779629850992 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document algebraic-stack-train +0.0018313600689757324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document algebraic-stack-train +0.0002583902668716813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document arxiv +0.0002646575141232155 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document arxiv +0.0003165521247456758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document arxiv +0.0002920706460176214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document arxiv +0.00028396813182810215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document arxiv +0.00030445161883108107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document arxiv +0.00031628781276576474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document arxiv +0.0003083776568189157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document arxiv +0.0003176359471472902 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document arxiv +0.0002536009369131698 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document arxiv +0.0003067491424681363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document arxiv +0.0002597217257557784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document arxiv +0.0003788556450109768 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document arxiv +0.0002796563272052598 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document arxiv +0.00033573826524290287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document arxiv +0.00030523658022800287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document arxiv +0.00032211552192240096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document arxiv +0.0003329295675164247 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document arxiv +0.0003101982186639862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document arxiv +0.00032361798234223355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document arxiv +0.0003495541581652915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document arxiv +0.0002821637448858042 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document arxiv +0.00030399523537629673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document arxiv +0.0002955658968247219 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document arxiv +0.00028942158502924254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document arxiv +0.00028769546171490733 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document arxiv +0.0002938111057234182 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document arxiv +0.0002711150403010948 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document arxiv +0.00031130095874747565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document arxiv +0.0003002996118160777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document arxiv +0.0003732757901604459 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document arxiv +0.00026784205751795894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document arxiv +0.0002799626521661984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document arxiv +0.00034334276069078164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document arxiv +0.0003582469803674965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document arxiv +0.00031094844818418623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document arxiv +0.0002766228384977191 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document arxiv +0.00030297116159471485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document arxiv +0.00027033888377464685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document arxiv +0.00030090862368377933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document arxiv +0.00028543875802490955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document arxiv +0.00027559768459074204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document arxiv +0.0003182185533962886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document arxiv +0.0003311392971435837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document arxiv +0.00028751652060804325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document arxiv +0.000303466863212589 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document arxiv +0.00033400462801277524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document arxiv +0.0002589234031777426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document arxiv +0.0002913508598466723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document arxiv +0.0002670572450004856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document arxiv +0.00032027399105647656 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document arxiv +0.00032188376258379377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document arxiv +0.0003161585784100882 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document arxiv +0.0003184249182974135 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document arxiv +0.00030381336664000807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document arxiv +0.0003190437442184283 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document arxiv +0.0002537961798200545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document arxiv +0.0003017817117223326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document arxiv +0.00028685268513240224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document arxiv +0.00031265179094451165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document arxiv +0.00034708319096986816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document arxiv +0.00026650837943080664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document arxiv +0.00034588832248507335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document arxiv +0.0002416982248399037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document arxiv +0.0003089296918222243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document arxiv +0.00029137184185700827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document arxiv +0.00026464226846800774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document arxiv +0.00030545397919456627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document arxiv +0.0003206778460448875 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document arxiv +0.00030968971641110967 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document arxiv +0.00023325653928600864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document arxiv +0.00030526899198338555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document arxiv +0.00035376719076633584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document arxiv +0.000290224385981026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document arxiv +0.000294650083382008 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document arxiv +0.00028768858128616436 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document arxiv +0.00030856965235527843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document arxiv +0.00030579942447879054 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document arxiv +0.0002863101084704357 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document arxiv +0.0002870032092492213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document arxiv +0.000264182727569885 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document arxiv +0.0002974012367036449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document arxiv +0.00032238412143059203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document arxiv +0.00031683716893819036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document arxiv +0.00031157434937617524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document arxiv +0.0003411742735695989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document arxiv +0.00026778444816570715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document arxiv +0.0003037045797275201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document arxiv +0.00027746114370081314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document arxiv +0.00027148285946862043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document arxiv +0.00028042950114678207 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document arxiv +0.0003235607816590721 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document arxiv +0.0003086692227306295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document arxiv +0.00033990349455148105 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document arxiv +0.00030945053208470265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document arxiv +0.00027309074552265303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document arxiv +0.00028737393506316194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document arxiv +0.0003098868328009879 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document arxiv +0.0002614229162588409 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document arxiv +0.0002884388407820923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document arxiv +0.0031025147279277244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document books +0.003102019887362634 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document books +0.0009996745994661548 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document books +0.0002406272620255565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document c4 +0.0002404825539493424 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document c4 +0.00024062296575435581 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document c4 +0.00024069315766818953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document c4 +0.00024055829162263452 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document c4 +0.00024062053397343032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document c4 +0.0002410715545206964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document c4 +0.00024024881846087368 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document c4 +0.0002407074700790688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document c4 +0.00024072141428809043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document c4 +0.00024027710230872736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document c4 +0.0002409111299205489 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document c4 +0.00024081954058275009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document c4 +0.00024086076794990912 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document c4 +0.00024098672620832446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document c4 +0.00024068622303333862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document c4 +0.00024140627024291824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document c4 +0.0002414512033594384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document c4 +0.00024028742594941463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document c4 +0.00024018036089269645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document c4 +0.0002398347365034979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document c4 +0.00024006780153485276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document c4 +0.00024015620270419213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document c4 +0.0002408848259695227 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document c4 +0.0002408023185278831 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document c4 +0.00024021196580140326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document c4 +0.00024077677271297493 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document c4 +0.00024087392454668027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document c4 +0.0002408071293824126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document c4 +0.00024042223828845715 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document c4 +0.0002411484752360495 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document c4 +0.00023605263746465907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document c4 +0.00023471222158326908 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document c4 +0.00023432138580287644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document c4 +0.00023407385623382327 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document c4 +0.00023487504174367091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document c4 +0.0002341843704976313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document c4 +0.00023421993170282486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document c4 +0.00023445057969132037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document c4 +0.0002337681680073047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document c4 +0.000234627964808109 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document c4 +0.0002338942211888584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document c4 +0.00023403849286843386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document c4 +0.00023405641310796305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document c4 +0.00023349169562397965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document c4 +0.00023381157386048856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document c4 +0.00023388742993790587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document c4 +0.00023363103829469813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document c4 +0.00023421141834630477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document c4 +0.00023420564352232565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document c4 +0.00023367463699173143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document c4 +0.00023344969163567033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document c4 +0.00023372196941547188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document c4 +0.00023399207645297834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document c4 +0.00023357915605505856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document c4 +0.00023337585642190864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document c4 +0.00023385005470157914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document c4 +0.00023301533534493465 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document c4 +0.00023377864302541782 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document c4 +0.00023323745848621437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document c4 +0.0002330594611151835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document c4 +0.0002334149675026783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document c4 +0.00023198945902291534 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document c4 +0.00023023784834634142 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document c4 +0.00022985623060187217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document c4 +0.0002292605284569516 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document c4 +0.00022926593333048894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document c4 +0.00022922766406807777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document c4 +0.00022898153911167426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document c4 +0.0002292473111593315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document c4 +0.000228804579400424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document c4 +0.00022865485613513526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document c4 +0.00022937426835887895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document c4 +0.00022917388311587372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document c4 +0.0002291660582019043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document c4 +0.00022907895248360543 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document c4 +0.0002294617879920205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document c4 +0.0002290452150516566 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document c4 +0.00022943405619715553 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document c4 +0.0002296271421006204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document c4 +0.00022854791372910372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document c4 +0.00022923123467686557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document c4 +0.00022852404355738494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document c4 +0.00022847798660086642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document c4 +0.0002289604586810316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document c4 +0.00022835479834950643 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document c4 +0.0002289149402884243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document c4 +0.00022806655474763446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document c4 +0.00022826296420992974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document c4 +0.00022906829636213627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document c4 +0.0002287628414466998 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document c4 +0.0002282673911253445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document c4 +0.00022869309841939134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document c4 +0.0002281540116815451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document c4 +0.0002259755756162738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document c4 +0.00022562331285233504 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document c4 +0.0002259061146106053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document c4 +0.00022567670836663787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document c4 +0.00022573165387587061 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document c4 +0.00022508514961670572 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document c4 +0.00022564642513773356 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document c4 +0.00022563088621998788 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document c4 +0.0002250438755373707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document c4 +0.00022524465346241134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document c4 +0.00022531737657666812 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document c4 +0.00022444687519363458 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document c4 +0.00022460397498596298 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document c4 +0.00022454218976501763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document c4 +0.00022447528843671366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document c4 +0.00022501666332178926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document c4 +0.00022453752304377972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document c4 +0.00022484451871163002 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document c4 +0.00022465678847154914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document c4 +0.00022453180917044732 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document c4 +0.0002247278486823009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document c4 +0.00022465794828242097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document c4 +0.00022431000701925386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document c4 +0.00022476020248460963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document c4 +0.00022467531771795015 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document c4 +0.0002236391309945234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document c4 +0.00022458764920536007 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document c4 +0.00022430877426744415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document c4 +0.0002247047786127192 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document c4 +0.0002245298090400035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document c4 +0.0002245648831396188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document c4 +0.00022292894729820784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document c4 +0.00022236668082957533 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document c4 +0.0002217622659895442 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document c4 +0.00022252452726732609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document c4 +0.00022135333211363678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document c4 +0.0002214571757787971 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document c4 +0.0002217188139237798 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document c4 +0.00022144214894640303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document c4 +0.00022100172806631854 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document c4 +0.00022156392409199052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document c4 +0.00022134830143710272 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document c4 +0.00022158598922529453 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document c4 +0.00022142932483041377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document c4 +0.00022120980907786554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document c4 +0.00022117917738112441 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document c4 +0.00022077089397851235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document c4 +0.00022093265074996711 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document c4 +0.00022091299741377004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document c4 +0.0002205849150703338 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document c4 +0.0002210648204787979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document c4 +0.0002214235747364102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document c4 +0.00022083907302221787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document c4 +0.0002206334237915964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document c4 +0.00022065193929912214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document c4 +0.00022079775597767288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document c4 +0.00022091492909963518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document c4 +0.00022095009987097293 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document c4 +0.0002208150577180165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document c4 +0.00022085759102772088 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document c4 +0.00022073789170129016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document c4 +0.00022049322781182384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document c4 +0.00022083270617761285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document c4 +0.00021982452827473632 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document c4 +0.00021899870446514259 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document c4 +0.00021890358773356361 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document c4 +0.00021875556609042841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document c4 +0.00021861195987201226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document c4 +0.00021856782186167455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document c4 +0.00021912837771543515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document c4 +0.00021900213768517756 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document c4 +0.00021871675851390374 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document c4 +0.0002180537056545586 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document c4 +0.0002188196714327129 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document c4 +0.00021851362624523464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document c4 +0.0002183236795498736 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document c4 +7.291153618675672e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document c4 +0.0003742481815405742 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document cc +0.00038204855962733055 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document cc +0.00038821818392663593 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document cc +0.00038723332988783727 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document cc +0.00038916141142149904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document cc +0.00038049542523949033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document cc +0.0003854755539534284 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document cc +0.00024202756466512517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document cc +0.0003915405155008087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document cc +0.0003927382151931033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document cc +0.0003839151202260479 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document cc +0.00040006817468967907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document cc +0.00040318965964443476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document cc +0.0003831013019452741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document cc +0.00039166638383204036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document cc +0.00039962784023961004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document cc +0.00039536707853602614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document cc +0.0004204304698247758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document cc +0.00041538899178693555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document cc +0.00039186953333675306 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document cc +0.00038945837196504305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document cc +0.0003919951238929062 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document cc +0.00044377065718528966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document cc +0.0004407759068603017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document cc +0.0002487811895843715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document cc 
+0.00039349432045556636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document cc +0.00041223198559462343 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document cc +0.0004036573014830213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document cc +0.0003825982215521807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document cc +0.00040386867133151386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document cc +0.00024460575279105167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document cc +0.000269029789531335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document cc +0.0003573757493252864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document cc +0.0004600876681392076 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document cc +0.0002605354166397086 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document cc +0.0003882502452157999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document cc +0.0002466747612126512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document cc +0.0004024726105072402 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document cc +0.00040820631128483644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document cc +0.0002691094350403538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document cc +0.00026916830387277267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document cc +0.0004204663297880574 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document cc +0.00042379698687085554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document cc +0.0004502169227311871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document cc +0.0002661708937015295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document cc +0.00031239486948031334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document cc +0.0003109054589936201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document cc +0.00045873053079760646 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document cc +0.00022904931423244635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document cc +0.0003813462028433663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document cc +0.00039188129256500874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document cc +0.00045124222276983765 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document cc +0.00048138658436853695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document cc +0.0003944178776279866 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document cc +0.00039941569676754006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document cc +0.00037952761190240494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document cc +0.0003944870860881476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document cc +0.0003891842411856621 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document cc +0.000387688981934861 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document cc +0.00039197953876258005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document cc +0.00039007915280311206 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document cc +0.0003995520363699188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document cc +0.00039230985654592406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document cc +0.0003929472067173851 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document cc +0.0003924096172671473 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document cc +0.0003881636143629905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document cc +0.000389790617937084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document cc +0.00037351762309221023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document cc +0.0003630196170929407 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document cc +0.00033532465765142113 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document cc +0.0003076088685761823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document cc +0.00039463850897720803 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document cc +0.0002843816115231449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document cc +0.0002909175709416474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document cc +0.00028867170997202486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document cc +0.0002838644617723659 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document cc +0.00029027869525543416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document cc +0.0002821339567560056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document cc 
+0.0002922988877045601 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document cc +0.0002866955958315786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document cc +0.0002865271754558126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document cc +0.0002861247475618473 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document cc +0.0002826681072408606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document cc +0.0002849746458282827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document cc +0.0002816966633435316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document cc +0.00026255342235948463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document cc +0.0002552895098829678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document cc +0.00025990194083107813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document cc +0.0002524062657685835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document cc +0.0002538577379748611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document cc +0.0002561415177406761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document cc +0.00026206253059694905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document cc +0.00026168095406910565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document cc +0.0002601305742008613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document cc +0.00025200823006814814 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document cc +0.0003229951981263502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document cc +0.00037289448266476045 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document cc +0.0003807825862179898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document cc +0.0003616333738191483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document cc +0.0003665117918907636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document cc +0.0003684186453633228 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document cc +0.0003589330610806066 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document cc +0.00036383861418030395 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document cc +0.000359841363355303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document cc +0.00036431044063050464 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document cc +0.0003668574090358279 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document cc +0.000362768263620199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document cc +0.0003501888032771077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document cc +0.000352401968221528 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document cc +0.0003541019701869794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document cc +0.0003628121865546891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document cc +0.0003752582953758773 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document cc +0.00037902046230424966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document cc +0.0003777927146925147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document cc +0.0003760676130509053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document cc +0.00034046049078755405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document cc +0.0003338847563259091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document cc +0.00033294499102761794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document cc +0.0004912026198265864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document cc +0.00032064363474664014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document cc +0.00032154190389541214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document cc +0.00032309660151746207 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document cc +0.00031181143365304544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document cc +0.00031046092294569104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document cc +0.00031150165249068046 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document cc +0.0003041314265988224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document cc +0.0003024834909739394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document cc +0.0003019936835833604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document cc +0.000292329665283177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document cc +0.0002867061143144972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document cc +0.00028443615610701707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document cc 
+0.00028462291013755945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document cc +0.0002793538601205013 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document cc +0.00027306573977044246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document cc +0.00027097155673336525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document cc +0.0002752934202112985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document cc +0.00043042012694697647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document cc +0.00047495648822986177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document cc +0.00047755032493473855 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document cc +0.0004706974343933747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document cc +0.00046682163297771817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document cc +0.0004616765425874178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document cc +0.00030644496751628097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document cc +0.0002909492555358308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document cc +0.00027272036068261724 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document cc +0.0004101070217315588 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document cc +0.0003728914338834357 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document cc +0.00036546911442305647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document cc +0.0003669945482407483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document cc +0.0003715902407424017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document cc +0.00035837486406683366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document cc +0.0003573318538685469 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document cc +0.0003553784893071916 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document cc +0.0004920659809912352 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document cc +0.0004533619411303183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document cc +0.00045067066057818706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document cc +0.00044396985139270645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document cc +0.00043198288204468477 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document cc +0.00043005174223738454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document cc +0.00041847118430776784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document cc +0.00042952036375796664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document cc +0.00043420594647324267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document cc +0.0003461123241053012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document cc +0.0003408581597849182 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document cc +0.00033172705422182547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document cc +0.0003392566490686136 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document cc +0.00033578341518385483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document cc +0.0003439196710518844 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document cc +0.00034559163447085543 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document cc +0.00033762478642902825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document cc +0.00033215210055107224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document cc +0.00033423579608014966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document cc +0.0004963355016025102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document cc +0.0004996862761456923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document cc +0.0005000551829325451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document cc +0.0005004212610098755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document cc +0.00027768695585500585 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document cc +0.00028395983854338433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document cc +0.00027835826303062254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document cc +0.0002740073176010804 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document cc +0.0002791830529274016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document cc +0.0002796863816194411 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document cc +0.00026697453022672804 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document cc +0.0002594197440280141 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document cc 
+0.0003779565697649222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document cc +0.00041835823476586606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document cc +0.00043788493575265915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document cc +0.0002731731970096006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document cc +0.000276305847423402 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document cc +0.0002704955773958623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document cc +0.0002629635944827518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document cc +0.000260070956974436 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document cc +0.00025661553791456334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document cc +0.00025794727207576157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document cc +0.00025295733980001527 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document cc +0.0003788106407021029 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document cc +0.0004882344027669431 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document cc +0.0003275324309642705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document cc +0.0004803401856640094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document cc +0.00046720138323433943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document cc +0.00043527810307095335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document cc +0.00043905395741627827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document cc +0.00048774175867331425 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document cc +0.00048380704121346737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document cc +0.0004779011848346118 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document cc +0.00046255587581908036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document cc +0.00045127922880511576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document cc +0.0004503891485256095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document cc +0.0004450142332303422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document cc +0.00044630282482516654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document cc +0.00044325014465743616 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document cc +0.0004263874842796447 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document cc +0.0004217530913646938 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document cc +0.000415120314341852 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document cc +0.00040987168279144537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document cc +0.00033468337266607834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document cc +0.0003353094464683005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document cc +0.0004833936821707294 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document cc +0.00047194878988920935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document cc +0.0004648324126996427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document cc +0.0004562345003964941 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document cc +0.0004933203505465098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document cc +0.0003530166075325466 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document cc +0.00035368548192804685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document cc +0.0004872620828289663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document cc +0.00048293889392426456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document cc +0.00047936768462267655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document cc +0.00047821013991587545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document cc +0.0004660610308564753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document cc +0.000394683430103437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document cc +0.00039165053441571324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document cc +0.0003906936040164381 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document cc +0.00038074803919159006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document cc +0.0003686529291578143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document cc +0.00035832920428870976 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document cc +0.00035929024535947033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document cc +0.0003538226556050544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document cc 
+0.0003584167868708799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document cc +0.0003480507542594234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document cc +0.0003413709023543034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document cc +0.00034001304759361455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document cc +0.00033430532902756514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document cc +0.00046519252660631277 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document cc +0.0002938876402514769 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document cc +0.00028676090994509047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document cc +0.00027296150117506716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document cc +0.00026513502621960483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document cc +0.0002680081327926125 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document cc +0.00025831225828720344 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document cc +0.00026647037295561 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document cc +0.0002525733734572654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document cc +0.00025831708887575375 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document cc +0.00042487627444443476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document cc +0.0004951213245023891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document cc +0.0004804051413177752 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document cc +0.0004662397611340532 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document cc +0.0004550138655253933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document cc +0.00044494909122746795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document cc +0.0002899112253051385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document cc +0.0004372879736279761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document cc +0.0004529568099252922 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document cc +0.00045127826158829573 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document cc +0.0004436558176737439 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document cc +0.0004419233237678378 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document cc +0.000434589215880319 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document cc +0.00029153613207706566 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document cc +0.0004312458058738854 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document cc +0.00028741854968757313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document cc +0.00046853200754421234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document cc +0.0004949145252030074 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document cc +0.00044459683920483167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document cc +0.0003836095306696336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document cc +0.0003789760237872398 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document cc +0.0003749227438304427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document cc +0.0003628558277173369 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document cc +0.00039468301394041474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document cc +0.00038874701821614864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document cc +0.0004158492456077867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document cc +0.00042360504554060077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document cc +0.00040386729844317623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document cc +0.00027595096702902474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document cc +0.00043638766787829135 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document cc +0.0002218691596850179 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document cc +0.0004437566108089954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document cc +0.0003889996411609667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document cc +0.00043454421906537704 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document cc +0.0004522564392830988 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document cc +0.00041517835659357416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document cc +0.0002614360863446896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document cc +0.00037543522111463596 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document cc +0.0004386190133514781 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document cc +0.00046358333286115075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document cc +0.00043186261317942404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document cc +0.0002377581602097957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document cc +0.00025973334085074254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document cc +0.00040139099332000796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document cc +0.00043674860686687174 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document cc +0.00040853289309329373 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document cc +0.000242910191729688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document cc +0.0004431071731750582 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document cc +0.0004388092670482523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document cc +0.000381418866255965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document cc +0.0004100117296419717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document cc +0.00042469230366022745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document cc +0.00041744151905374254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document cc +0.00022835699906752945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document cc +0.0004380161085387397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document cc +0.00044803212381807456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document cc +0.00040554932796137236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document cc +0.0004234508646347761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document cc +0.00043341209652360653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document cc +0.00023966604734537185 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document cc +0.000259165907316014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document cc +0.0004270653021833602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document cc +0.0004341547032162028 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document cc +0.0004111478117275994 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document cc +0.0004299383567984396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document cc +0.0004241899124590779 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document cc +0.0004502719349364145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document cc +0.00038994621469645615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document cc +0.0003859912398894952 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document cc +0.0004247535950310557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document cc +0.000386982084327716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document cc +0.0004196451040053251 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document cc +0.0004096278509782259 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document cc +0.0004373334932695721 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document cc +0.0004180889975240641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document cc +0.00042079636929672745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document cc +0.00038063574611812913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document cc +0.0003817505891515542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document cc +0.0004420096268860222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document cc +0.00039182670726410623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document cc +0.0003635667850372299 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document cc +0.00041564996472055667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document cc +0.000400529358757286 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document cc +0.0003939113874958451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document cc +0.00039066622068940996 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document cc +0.0004290098538807143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document cc +0.0004240739958197099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document cc +0.00040775392659215333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document cc +0.0004091634200396925 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document cc +0.00042299190476617914 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document cc +0.0003701492680344151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document cc +0.0003807353844384635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document cc +0.00038813507771983156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document cc +0.00040072346558408346 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document cc +0.0003603595180423597 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document cc +0.00038799421353112465 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document cc +0.00037575235582264926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document cc +0.0004239190342959713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document cc +0.0004606044799136546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document cc +0.00045107950652529253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document cc +0.0004391947201871058 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document cc +0.0004457516661123035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document cc +0.0004301297170991686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document cc +0.00044661704164586694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document cc +0.0004438849846114837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document cc +0.0004444205734316823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document cc +0.0004190924165303394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document cc +0.00043942581131677875 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document cc +0.00021568459798090663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document cc +0.0003814929225407199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document cc +0.0003217453179359235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document cc +0.00031719591470267974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document cc +0.00032434115726922137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document cc +0.0004079911120371051 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document cc +0.000329492766381148 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document cc +0.0003845916162001633 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document cc +0.0003835208964390098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document cc +0.00037847334157173194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document cc +0.00038296039903791865 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document cc +0.00037896336828472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document cc +0.00037620974396391355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document cc +0.00037420590727111843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document cc +0.000340490625886403 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document cc +0.0003078314411035827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document cc +0.00034153990750656097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document cc +0.0003308858103982067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document cc +0.0003452640607156025 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document cc +0.00033095276418403455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document cc +0.0003116308995860414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document cc +0.00032446713226408477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document cc +0.0003015816821912984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document cc +0.00031612418775706894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document cc +0.0003278516344971041 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document cc +0.00033079446736097217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document cc +0.00032278977146550837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document cc +0.00032065272988207914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document cc +0.0003936696452406576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document cc +0.0003450109536627789 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document cc +0.0003339787189919641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document cc +0.0003284303856176974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document cc +0.00033652677276843477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document cc +0.0003257822443845694 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document cc +0.0003293985569149334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document cc +0.0003310360260148262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document cc +0.0003233770986418526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document cc +0.0003172280092149422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document cc +0.0003160674744292835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document cc +0.00030931090289598506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document cc +0.0003093173886443107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document cc +0.00033167847081104083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document cc +0.00031131501311729723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document cc +0.00031046608876279845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document cc +0.00030569235942207244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document cc +0.00030777943671285197 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document cc +0.00029303314290956683 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document cc +0.0003045824546400205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document cc +0.00030360880677729793 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document cc +0.00031646239964835433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document cc +0.0003129122300603785 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document cc +0.00031060464956661433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document cc +0.000311819032500067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document cc +0.0002977872483902282 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document cc +0.0003009448600922438 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document cc +0.00028610292098537774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document cc +0.0002988326876216654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document cc +0.00028550828372819075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document cc +0.0002830381750875739 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document cc +0.0002848495855927156 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document cc +0.0002856443760308144 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document cc +0.00027442895344188584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document cc +0.0002681160554049462 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document cc +0.0003421482544126989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document cc +0.0004005872948449718 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document cc +0.0003930123959320308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document cc +0.0003867271832275778 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document cc +0.000380805140455254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document cc +0.0003814769861947819 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document cc +0.00038025170883282324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document cc +0.0003738026647867475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document cc +0.00018960856915036276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document cc +0.0003697177501953134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document cc +0.00036674194328136693 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document cc +0.00036447406838697555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document cc +0.00036686410861101255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document cc +0.00035915267825103423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document cc +0.0003624758404026675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document cc +0.0002822812140180794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document cc +0.00030620512946920813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document cc +0.000294249776520589 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document cc +0.00030238536967523434 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document cc +0.00029509593361580754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document cc +0.0002906912701830899 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document cc +0.0002921944165474959 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document cc +0.00028358919691127954 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document cc +0.0002813182772323272 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document cc +0.00027442640800299205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document cc +0.0002747820342933984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document cc +0.0002747584403979717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document cc +0.00027499129634862444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document cc +0.0002712050404257197 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document cc +0.0002616256943143254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document cc +0.00026769938929002815 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document cc +0.00038396081322727017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document cc +0.0003863140490027991 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document cc +0.00037702277513203237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document cc +0.0003633274156107032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document cc +0.0003587473889240435 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document cc +0.0003507672084278415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document cc +0.00033776425499780385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document cc +0.0003377914127574796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document cc +0.00032948015659161326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document cc +0.00033245638541392985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document cc +0.00031080707640648695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document cc +0.0002976903331149755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document cc +0.0002965121463725523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document cc +0.0002933849695266647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document cc +0.0002837035078508233 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document cc +0.00028684569079589323 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document cc +0.0003145192320802359 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document cc +0.0003566937253273515 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document cc +0.0003470199109592918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document cc +0.0003060245312041868 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document cc +0.0002650817213818789 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document cc +0.0002643604938780134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document cc +0.000299350876031416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document cc +0.0003178540797697938 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document cc +0.000271850367887767 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document cc +0.00031349896596549 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document cc +0.00031749734412765755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document cc +0.0003791137842391209 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document cc +0.0003742334169957992 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document cc +0.0003705639757351107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document cc +0.0003126986769797042 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document cc +0.00031038132814561196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document cc +0.00036464437173804883 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document cc +0.0003569480488951322 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document cc +0.0003541239221619106 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document cc +0.00035315297411308053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document cc +0.0003572451925404141 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document cc +0.0003514986129411253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document cc +0.0003521798298425866 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document cc +0.00034553677439244716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document cc +0.000349004719809412 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document cc +0.0003468247484872769 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document cc +0.0003465822608356558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document cc +0.00035410983132162007 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document cc +0.0003487908354969444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document cc +0.0003479024763238147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document cc +0.000341412530646823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document cc +0.00034451316273667034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document cc +0.0002618849993484869 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document cc +0.00026788679978901144 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document cc +0.00027450670773227214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document cc +0.0002661273129899329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document cc +0.00026836569676402957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document cc +0.00026155876975483236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document cc +0.0002609276830117151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document cc +0.0002644161630512771 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document cc +0.00036789208972872557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document cc +0.00037829849439990513 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document cc +0.0003788894943523098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document cc +0.0003617207777959397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document cc +0.0002541334487248998 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document cc +0.0002707945538071073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document cc +0.00027046282716455214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document cc +0.0002652443167243215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document cc +0.0002685859923850986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document cc +0.00025734961751176414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document cc +0.000259041720872915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document cc +0.00025340107274823446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document cc +0.00025757135121837893 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document cc +0.00025617700500574084 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document cc +0.0002566931670562857 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document cc +0.0002543871190716101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document cc +0.00024997565589481713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document cc +0.0002954079779456287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document cc +0.00034890741135252835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document cc +0.0003473298137731525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document cc +0.0003296959618486435 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document cc +0.0003304520061604598 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document cc +0.00032377956175729824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document cc +0.00031700696295168713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document cc +0.0003060382346081943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document cc +0.0003012003005056863 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document cc +0.0002981074073993884 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document cc +0.0002922128825950705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document cc +0.000348901087722931 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document cc +0.0003408286289467841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document cc +0.0003410649680770183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document cc +0.0003358524215576502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document cc +0.0003343661874989231 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document cc +0.00032810573699389156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document cc +0.00032261449539097497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document cc +0.0003162694866049203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document cc +0.0003158381156468853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document cc +0.000317376061083603 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document cc +0.0003125788639953052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document cc +0.0003010105041885602 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document cc +0.0003065865059090678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document cc +0.0003084275726508053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document cc +0.00030966560718296085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document cc +0.0002957728057853081 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document cc +0.00029904164542325336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document cc +0.0002955358888729187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document cc +0.00028692976446931544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document cc +0.0002923476214935797 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document cc +0.0002893691697212419 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document cc +0.0002855895211981585 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document cc +0.00027968347097626246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document cc +0.0002810783462604979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document cc +0.00027794080455729715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document cc +0.00034784376461416953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document cc +0.0003488347959010943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document cc +0.00034790583710250724 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document cc +0.000345913166618151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document cc +0.00033801936268066675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document cc +0.0003290591130212315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document cc +0.00034051399521366823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document cc +0.00032470943131841784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document cc +0.00031679540050914276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document cc +0.00031814596342422325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document cc +0.0003156466289485036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document cc +0.00029985010879003633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document cc +0.0002905176377776361 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document cc +0.0004206836775460856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document cc +0.00020660449162246918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document cc +0.0003461727254468087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document cc +0.00020592870907067763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document cc +0.00034173505299233005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document cc +0.0004052437256652738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document cc +0.0004080650901351697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document cc +0.00039778184149144276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document cc +0.00039046311464950275 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document cc +0.00039043444911071384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document cc +0.000388575704932843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document cc +0.00019737533145666597 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document cc +0.00037610755595812403 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document cc +0.00037315400127598317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document cc +0.00037415028580922163 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document cc +0.00036694041707212337 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document cc +0.00018947219857306515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document cc +0.00037046050826533545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document cc +0.0003587440768559087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document cc +0.00034623936498708903 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document cc +0.0003502289592617922 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document cc +0.00034692398063649823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document cc +0.000339340809421849 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document cc +0.0003360510394816983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document cc +0.0003354673850814145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document cc +0.00032937682875877047 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document cc +0.00032844505049317715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document cc +0.00028287199339908627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document cc +0.0002795217197003578 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document cc +0.00028048955601883463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document cc +0.0002769326396439027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document cc +0.0002727090021299243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document cc +0.0002726577841024554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document cc +0.00026663619593455374 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document cc +0.00026068042672138127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document cc +0.0002637704114326801 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document cc +0.0002593043567100412 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document cc +0.0002599897110113453 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document cc +0.0002435078682758859 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document cc +0.0002450530071379054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document cc +0.00024233331983743606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document cc +0.0002934750947999535 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document cc +0.00033241226364044474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document cc +0.00032938406090272075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document cc +0.00032778705403953246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document cc +0.00032184551480398754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document cc +0.00031874002264945737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document cc +0.0003165319685666433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document cc +0.00031307071173376295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document cc +0.00031119524184911957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document cc +0.0003102253344576429 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document cc +0.0003088976240383192 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document cc +0.0002951410823077708 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document cc +0.00029772657676757413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document cc +0.0003056048989909935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document cc +0.00031991305381648026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document cc +0.00030890256978362426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document cc +0.0003109382904091933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document cc +0.00031035798529690644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document cc +0.00030741666395911753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document cc +0.0002989918594861846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document cc +0.00029569635443989434 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document cc +0.0002973992445667285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document cc +0.000293397351001072 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document cc +0.00028737817438047954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document cc +0.00028252738144009747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document cc +0.0002805511898623541 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document cc +0.0003718020784620472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document cc +0.0003499713845765235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document cc +0.00034283547445326676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document cc +0.00031464759888838765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document cc +0.00033188946446414833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document cc +0.000326084432195463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document cc +0.0003764568303917893 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document cc +0.0003604955598858414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document cc +0.0003655654554133222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document cc +0.00035762304033750504 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document cc +0.00038478883950347103 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document cc +0.00027735714341247454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document cc +0.00028139534607773563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document cc +0.00019777292251713763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document cc +0.000285571704874486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document cc +0.00028543482146244363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document cc +0.00019434234484256758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document cc +0.00027854908176986763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document cc +0.0002847068039566143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document cc +0.00028672356943064853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document cc +0.00027782687605808177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document cc +0.0002843539634105203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document cc +0.0002894748379090401 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document cc +0.0002868852440186493 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document cc +0.0002818504885373851 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document cc +0.00028680112812941034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document cc +0.00019258978168723977 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document cc +0.00028760637934715155 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document cc +0.0002820439443912918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document cc +0.0002831001054410018 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document cc +0.00029001901552467397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document cc +0.00027779449377883156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document cc +0.00019949837437516796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document cc +0.0002907306472984446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document cc +0.00027814858381318327 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document cc +0.00019472790889161432 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document cc +0.00020472626596924125 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document cc +0.0002870045081974301 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document cc +0.00019812241927078482 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document cc +0.0002817553333369554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document cc +0.00027829782796642117 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document cc +0.00028289431732284113 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document cc +0.0002795526296717729 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document cc +0.00027682829988044574 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document cc +0.0002895432402719184 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document cc +0.0002823174903941811 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document cc +0.00028170972351837796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document cc +0.00027807915877838826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document cc +0.00028588515681452956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document cc +0.00028112324090816726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document cc +0.00020636178289985485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document cc +0.00019447255290980535 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document cc +0.0002850824220591452 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document cc +0.00027856429520116784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document cc +0.0002820880676635633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document cc +0.00028943902215995714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document cc +0.0002676366291085329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document cc +0.00023806333809954687 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document cc +0.00024526460430233455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document cc +0.00023876876664622726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document cc +0.00023379770334179805 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document cc +0.00024175151269138382 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document cc +0.00023386583242595706 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document cc +0.00023771797150160827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document cc +0.0002262748967483896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document cc +0.0002408148346432682 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document cc +0.00023398651720444235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document cc +0.00022989433874474592 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document cc +0.00023948500543957772 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document cc +0.0002331594076859196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document cc +0.00023375132439600242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document cc +0.00023923410909668642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document cc +0.00023952796315562954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document cc +0.0002327466076905069 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document cc +0.00023082758956797212 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document cc +0.0002240509275524448 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document cc +0.00022798879995765268 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document cc +0.000221172516774386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document cc +0.00021767045123534623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document cc +0.00021982832794804484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document cc +0.00021971626543789102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document cc +0.00022566565206920132 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document cc +0.0002181984894194856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document cc +0.00021831417549554653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document cc +0.00021601405421187145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document cc +0.00022275733725519607 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document cc +0.00021847734911973986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document cc +0.0002243591012664014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document cc +0.00021688758139483833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document 
cc +0.0002182953624789215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document cc +0.00020475155724026002 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document cc +0.00021498078062960065 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document cc +0.0002157914337233064 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document cc +0.00021781838494967963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document cc +0.00021723242266814558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document cc +0.0002176782686553837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document cc +0.0003486179404943968 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document cc +0.00034882846352857634 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document cc +0.00031400868448352596 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document cc +0.00030273484020011963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document cc +0.00029895889118145404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document cc +0.00029770764609621714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document cc +0.0002990181332116852 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document cc +0.00029653733972285996 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document cc +0.00029624649222942476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document cc +0.00029625609720203576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document cc +0.00029731928930852147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document cc +0.00029011721326148513 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document cc +0.00028849788197494655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document cc +0.00021601278623858145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document cc +0.00021319599281739178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document cc +0.0002153325290600083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document cc +0.00018566946174516558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document cc +0.00020736824394291617 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document cc +0.00020857419820128004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document cc +0.00020058526129536423 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document cc +0.00020745812166665217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document cc +0.00020652171015271702 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document cc +0.00020643808911278608 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document cc +0.00020040513914482103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document cc +0.00020598050188272898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document cc +0.0001969184139343296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document cc +0.0001972748812937012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document cc +0.0002038556751586195 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document cc +0.00020245186011313464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document cc +0.00019950381422038783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document cc +0.00020837055459665258 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document cc +0.00020371856218246096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document cc +0.00019537612301625791 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document cc +0.00019914984508813857 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document cc +0.0002053787713691309 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document cc +0.00019082100541008637 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document cc +0.00020397153334531813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document cc +0.0002021462693077317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document cc +0.00019609357008124035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document cc +0.00019693256622486236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document cc +0.00020007239732428112 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document cc +0.00020467075741591954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document cc +0.00019584883400022932 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document cc +0.00019135050391176972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document cc +0.0003362829834208298 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document cc +0.00034013691154784095 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document cc +0.00033215887031941976 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document cc +0.00032681189065396707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document cc +0.0003149138485493094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document cc +0.00030179177307540077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document cc +0.0002923278437581119 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document cc +0.00029470052278994486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document cc +0.0002994095093045731 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document cc +0.00029033525096085037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document cc +0.00029390798852496565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document cc +0.0002916230924130842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document cc +0.00029419886374594913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document cc +0.0002865469756730764 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document cc +0.00021191292549942086 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document cc +0.00021369664817409847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document cc +0.00021612485624266726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document cc +0.00022242192634588478 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document cc +0.00014605095659989698 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document cc +0.00022070626106341693 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document cc +0.0002174420774054071 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document cc +0.00021325858963116995 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document cc +0.0002124322999488052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document cc +0.0002081218896969054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document cc +0.0002108710211556957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document cc +0.00020686867095978426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document cc +0.00020895752681041895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document cc +0.00020741922266415738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document 
cc +0.0002069112657197308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document cc +0.00020644627473468118 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document cc +0.00020332991338121604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document cc +0.0003560895677789848 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document cc +0.00032915779111908214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document cc +0.00033810613317040864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document cc +0.00033729626594036923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document cc +0.00033550342864602944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document cc +0.00034173474024556906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document cc +0.000331505340748827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document cc +0.0003270050330117195 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document cc +0.00032585275329172556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document cc +0.0003143383203190604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document cc +0.00031655199110388894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document cc +0.00030738872158476413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document cc +0.00030838388352699285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document cc +0.0003053596995351888 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document cc +0.00031836304739584593 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document cc +0.000315315435873905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document cc +0.0003087116248965243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document cc +0.00030396790625537645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document cc +0.0003335812246032149 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document cc +0.00034570956323095843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document cc +0.00034563035636675786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document cc +0.00033411265479076335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document cc +0.00034439191141692787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document cc +0.0003364483125496565 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document cc +0.0003299500453608033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document cc +0.00033163377700074837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document cc +0.00032638649660627673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document cc +0.00032616167939645234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document cc +0.0003205289298760723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document cc +0.00031939393740815355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document cc +0.00031593164066731296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document cc +0.00031928871111254405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document cc +0.00029670189073175004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document cc +0.00020517703846735904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document cc +0.00020128418186172073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document cc +0.00019662723895606717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document cc +0.0001981157042081407 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document cc +0.00019703489037041608 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document cc +0.00019079796331785068 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document cc +0.0001909352306690079 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document cc +0.00018824662295261396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document cc +0.00019864275319325954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document cc +0.00018818516521649587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document cc +0.00018875694972812844 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document cc +0.00018231621170645482 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document cc +0.00018349407845798273 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document cc +0.00018088971427746906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document cc +0.00018296284236327237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document cc +0.0001876011825819916 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document cc +0.000329052068725176 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document cc +0.00032223616273648536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document cc +0.00031272564089633955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document cc +0.00031621609908414494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document cc +0.0003117213560911235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document cc +0.00030218064069945934 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document cc +0.00030658916600512085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document cc +0.0002915863534115821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document cc +0.0002940280138374372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document cc +0.00029067860468866085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document cc +0.00028529228063135635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document cc +0.00028336893301452256 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document cc +0.0002794668089130099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document cc +0.00021681361378827842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document cc +0.0001484664674497246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document cc +0.00021950558378215133 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document cc +0.00021806860758808645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document cc +0.00021819568718852282 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document cc +0.00021626925931585001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document cc +0.0001464536143077762 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document cc +0.00021432777088808917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document cc +0.000213473805865147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document cc +0.00021397067253964538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document cc +0.00020758957647437263 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document cc +0.00020687124337683314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document cc +0.00020630057046511005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document cc +0.0002091166859352538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document 
cc +0.00020777355025615267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document cc +0.00020709287641496176 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document cc +0.00020736464660577094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document cc +0.00020062246741862607 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document cc +0.00020693207561942915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document cc +0.00021151004871893024 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document cc +0.00019930249098689716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document cc +0.00021589710041231824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document cc +0.00021369204789905741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document cc +0.0002147099923936778 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document cc +0.00021077531190389536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document cc +0.0002100509829113836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document cc +0.00021185362601571124 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document cc +0.00020722136637339565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document cc +0.00020300093701169531 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document cc +0.00019859737993313477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document cc +0.00019971314372100164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document cc +0.00019549908270269278 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document cc +0.00019649820843534028 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document cc +0.00019619415513498067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document cc +0.00019493006120377898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document cc +0.00019499409035775506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document cc +0.00019252988593634277 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document cc +0.00019440768268686405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document cc +0.00018747161324755577 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document cc +0.0001879575932372779 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document cc +0.00019040707058357506 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document cc +0.0001871931095090703 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document cc +0.00020112966223017096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document cc +0.00020516878165311017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document cc +0.00020664735191740533 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document cc +0.00021041398572882962 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document cc +0.00020397992929690396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document cc +0.0002039978580295561 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document cc +0.00020592785601142126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document cc +0.0001990755527445265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document cc +0.00019729564847798732 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document cc +0.00019958182230527032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document cc +0.0001985037302636386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document cc +0.00020204130355115716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document cc +0.0002000296401958085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document cc +0.0001983064832295463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document cc +0.00019663108484195617 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document cc +0.00019510678560556523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document cc +0.0001873284057063206 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document cc +0.00019311553072495885 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document cc +0.00034652137288816547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document cc +0.0002813690318850024 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document cc +0.00027697649713138685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document cc +0.0002755419092534421 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document cc +0.0002681583054440219 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document cc +0.00026945753192750824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document cc +0.00026169470768245737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document 
cc +0.00026437008960810825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document cc +0.0002637294838228 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document cc +0.00026491867965088836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document cc +0.00025504483625138986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document cc +0.0002545040623796586 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document cc +0.0002546682814073622 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document cc +0.00025545439487142615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document cc +0.0002626896557978271 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document cc +0.00025092040940402784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document cc +0.0002589154885863872 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document cc +0.00024106160482721467 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document cc +0.0002483289690087987 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document cc +0.0002388930282784437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document cc +0.00024006340759273874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document cc +0.00023765248178029045 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document cc +0.00023061351965578936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document cc +0.00024954224883546477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document cc +0.00017861017233018525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document cc +0.00017810832743667658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document cc +0.00017599709170759497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document cc +0.00017462723516505223 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document cc +0.0002906316527068669 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document cc +0.00033762141066247166 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document cc +0.00017170670574152494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document cc +0.00017258674515137717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document cc +0.0002815386173173926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document cc +0.0002996845935618989 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document cc +0.0002735268488987296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document cc +0.0002971738713071517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document cc +0.0002942690674002763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document cc +0.0003322222207729567 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document cc +0.0003378721656198464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document cc +0.00018307262621851067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document cc +0.00033956081502775057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document cc +0.00031604820927876276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document cc +0.00028805657681088917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document cc +0.00026312293321215633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document cc +0.00034366936722921455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document cc +0.0002865256504406559 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document cc +0.0003063615195861786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document cc +0.00028412791619666136 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document cc +0.00028060835132727154 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document cc +0.00032544974761560506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document cc +0.0002647177833217225 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document cc +0.0003152621884896575 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document cc +0.0003054625140336913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document cc +0.00031183308312292263 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document cc +0.00018175026696621178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document cc +0.00017699918328872 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document cc +0.00018222339261441908 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document cc +0.00018348005930964137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document cc +0.0001810735993810541 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document cc +0.00030846441282038914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document cc 
+0.0002972326889310354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document cc +0.00017433421318235594 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document cc +0.00032799458649525895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document cc +0.00032482130048512673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document cc +0.00031943465668672475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document cc +0.00029615593630484517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document cc +0.0002893126939511001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document cc +0.0002849288351723284 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document cc +0.00028383906633569267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document cc +0.00028072526091262615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document cc +0.000284239564292377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document cc +0.0002778903109432523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document cc +0.0002771644389501471 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document cc +0.0002733316182319337 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document cc +0.00026362539185869363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document cc +0.0002636325383220217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document cc +0.00026740622442302886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document cc +0.0002646771971853427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document cc +0.0002628566720605389 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document cc +0.0002644760695434766 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document cc +0.0002623837702310999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document cc +0.00026088722976772894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document cc +0.0002567065374799158 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document cc +0.00018857382101207726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document cc +0.00019036580399817203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document cc +0.00018348828065261222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document cc +0.00018491851780345073 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document cc +0.00018904887260080187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document cc +0.0001875609304251801 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document cc +0.00018393034720015817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document cc +0.00018419795526114903 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document cc +0.00018699955623404795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document cc +0.00018276256902965128 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document cc +0.00017698045695190812 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document cc +0.00018104650132303642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document cc +0.00017758206731279688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document cc +0.00017131402995103497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document cc +0.000175944428350446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document cc +0.0003416745727147391 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document cc +0.0003163259373952889 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document cc +0.0002804489269172448 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document cc +0.00028748272397403175 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document cc +0.00027603318345630605 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document cc +0.000271638824679648 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document cc +0.0002763761210210942 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document cc +0.00026501984873172717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document cc +0.00026422486894694714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document cc +0.0002686339100849262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document cc +0.0002610837453940606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document cc +0.000260974343729353 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document cc +0.0002599403837029134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document cc +0.0002937273113238609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document cc +0.0003341790732600504 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document cc 
+0.0002620661576600244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document cc +0.0003027929169239288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document cc +0.00031944039129326894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document cc +0.00019025676304139009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document cc +0.00018680910145009907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document cc +0.00034215840419416437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document cc +0.00018618120812119364 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document cc +0.00018605853095599425 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document cc +0.00018120712626096538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document cc +0.00018315079292495327 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document cc +0.00018362556449041974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document cc +0.0001780024456718171 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document cc +0.00033296526436178697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document cc +0.0001802398632282846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document cc +0.00017340263100798256 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document cc +0.00017755840547238697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document cc +0.00018419413735260606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document cc +0.00017869518174591322 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document cc +0.00017526271460129484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document cc +0.00017852168597981907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document cc +0.00017566536156787157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document cc +0.00017589867964432936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document cc +0.00017831487394075305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document cc +0.00017837310528935862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document cc +0.00018200908814216548 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document cc +0.0001795136627511612 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document cc +0.0003414021775300033 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document cc +0.00017177291787788502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document cc +0.0003441900648571877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document cc +0.0003394534597060673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document cc +0.0003236887233114832 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document cc +0.0001639544129688747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document cc +0.00019137443753211255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document cc +0.00018575146284680153 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document cc +0.00019184792863440243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document cc +0.00018966043065679055 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document cc +0.00017968851317035848 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document cc +0.00018479881897661546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document cc +0.0001813642692683015 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document cc +0.0001686449798983066 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document cc +0.00018516104592230446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document cc +0.00031283726601066385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document cc +0.0003248607542883853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document cc +0.00031583241601202365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document cc +0.00031238270857730376 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document cc +0.000307150592403979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document cc +0.00029443829986847044 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document cc +0.0002942723732234677 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document cc +0.00023514930666443422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document cc +0.0020776328951453444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document cc +0.0021768234410538883 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document cc +0.002106973549276289 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document cc +0.002110915756171751 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document cc +0.0017032382109816464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document cc +0.0019047944877712286 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document cc +0.0019402711744016077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document cc +0.0006264790011223686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document cc +0.0017885401938106643 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document cc +0.0003547982093445404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document falcon +0.00035934014428504944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document falcon +0.00035707704501371544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document falcon +0.00035287930712815354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document falcon +0.00035977166728996823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document falcon +0.0003581675664109838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document falcon +0.0003548617059697185 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document falcon +0.0003639582000286208 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document falcon +0.00035375839698688127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document falcon +0.0003743722020080678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document falcon +0.0003530399715341242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document falcon +0.00035511875882752406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document falcon +0.0003618733574783154 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document falcon +0.00035185243285420104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document falcon +0.0003541503739732106 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document falcon +0.0003631679485751914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document falcon +0.00035748045578182274 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document falcon +0.0003606490690555877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document falcon +0.0003626383296610091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document falcon +0.00035442644361264756 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document falcon +0.00035978370170539796 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document falcon +0.0003585562375341541 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document falcon +0.0003601958372888019 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document falcon +0.000350277765402227 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document falcon +0.0003616521184211704 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document falcon +0.0003620625543608188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document falcon +0.0003560781983850704 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document falcon +0.0003553209610592676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document falcon +0.00035905348643915075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document falcon +0.00034744258805696526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document falcon +0.00035462784035661496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document falcon +0.00034768186175100895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document falcon +0.0003568534635532736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document falcon +0.00035586511544371234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document falcon +0.0003524567827568137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document falcon +0.0003512453770426313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document falcon +0.0003591792726468799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document falcon +0.0003514024529343127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document falcon +0.0003584880112586934 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document falcon +0.00035133552916418045 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document falcon +0.0003600811981350215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document falcon +0.0003571663974228119 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document falcon +0.00035768103378874214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document falcon +0.00035939205561113694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document falcon +0.00035186773916029825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document falcon +0.0003542829672490847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document falcon +0.0003592783642898726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document falcon 
+0.0003556367340099302 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document falcon +0.00035391392271377027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document falcon +0.00035486725707484836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document falcon +0.00034866743396828035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document falcon +0.0003517219808644735 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document falcon +0.00034874458549673823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document falcon +0.000355773136961014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document falcon +0.00035611750387841917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document falcon +0.00035305602013916315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document falcon +0.0003578207127071924 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document falcon +0.00035514635841943707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document falcon +0.00034816946212866206 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document falcon +0.0003512707269761496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document falcon +0.0003483392117980654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document falcon +0.0003572169607204321 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document falcon +0.00035139153281660794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document falcon +0.00035536422129036537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document falcon +0.000352017164107143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document falcon +0.000351889550179365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document falcon +0.000358759689953589 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document falcon +0.0003569286079869268 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document falcon +0.0003657752958602099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document falcon +0.00035396127934790697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document falcon +0.0003618565071224743 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document falcon +0.00035146051531973204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document falcon +0.00036107135765783567 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document falcon +0.00035019554279994576 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document falcon +0.00035567858879904983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document falcon +0.0003504753174793183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document falcon +0.00035931140831329194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document falcon +0.0003502967866002823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document falcon +0.0003532911801041972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document falcon +0.0003583543013070199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document falcon +0.0003566243489931224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document falcon +0.0003468752314799221 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document falcon +0.0003597840618138091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document falcon +0.00035128822484768084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document falcon +0.00035889496943437507 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document falcon +0.000352400524650424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document falcon +0.0003518689536768735 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document falcon +0.00035866864741303467 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document falcon +0.0003454687659106334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document falcon +0.00035348007259317576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document falcon +0.0003539752270940644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document falcon +0.00035146495994081 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document falcon +0.00035397212846310423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document falcon +0.00035208246467162587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document falcon +0.0003490843168676626 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document falcon +0.00035299633658644394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document falcon +0.00034868327466167065 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document falcon +0.00035941351365601583 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document falcon +0.0003545343062735255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document falcon +0.0003528956380445978 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document falcon 
+0.0003553355770443352 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document falcon +0.0003644224004937743 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document falcon +0.00035234291036216907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document falcon +0.0003596237469847771 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document falcon +0.0003531996065735989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document falcon +0.0003547177054106099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document falcon +0.0003575586499260483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document falcon +0.00035262635135283667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document falcon +0.0003624191962188944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document falcon +0.0003488398052948616 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document falcon +0.0003598294093147917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document falcon +0.00035583006534466323 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document falcon +0.00035403139653225103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document falcon +0.00036134702642187156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document falcon +0.0003573689927162834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document falcon +0.0003577141131435527 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document falcon +0.00035208814419277406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document falcon +0.00035996720683665625 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document falcon +0.00035415304658912596 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document falcon +0.00036353353029443546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document falcon +0.0003537326003150983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document falcon +0.00036053976358299083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document falcon +0.000352380489373494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document falcon +0.00036154661616900994 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document falcon +0.00035959332325963614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document falcon +0.0003597954667189692 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document falcon +0.0003563108270597542 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document falcon +0.0003582891940460143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document falcon +0.0003497728210484297 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document falcon +0.0003549834902179354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document falcon +0.0003529828233484542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document falcon +0.00034627483903285777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document falcon +0.00035569006572589215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document falcon +0.00035449377946910314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document falcon +0.00035802844396194623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document falcon +0.0003617277809353208 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document falcon +0.00035034118898654814 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document falcon +0.000351091193908611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document falcon +0.0003527914342210668 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document falcon +0.00035028288369781376 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document falcon +0.00035775745592780506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document falcon +0.0003449630690661468 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document falcon +0.0003583490698830361 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document falcon +0.0003476995746684122 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document falcon +0.0003535632505019212 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document falcon +0.00035640180641147417 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document falcon +0.000361731045691765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document falcon +0.0003534082129597368 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document falcon +0.0003550344149828664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document falcon +0.00035363002411364057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document falcon +0.0003537265579677396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document falcon +0.00034950531383577937 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document falcon +0.00035008511827347514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document falcon 
+0.00035594533400871325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document falcon +0.00035266312861335946 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document falcon +0.00035280268794863923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document falcon +0.0003565470391528536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document falcon +0.0003588492322689137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document falcon +0.00035469909697832775 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document falcon +0.00034712082813410526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document falcon +0.000348701157101807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document falcon +0.0003500192014479944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document falcon +0.00035120560544669755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document falcon +0.00035403656850437445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document falcon +0.00035852376560749366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document falcon +0.0003534754068111774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document falcon +0.00035591740046720765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document falcon +0.000348522354782563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document falcon +0.0003533533959664415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document falcon +0.00035631425964030697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document falcon +0.0003485886551574741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document falcon +0.00035917652631065777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document falcon +0.0003482975272111288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document falcon +0.00035580661277480167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document falcon +0.0003492290722955348 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document falcon +0.00034989284450240613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document falcon +0.0003545677216162781 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document falcon +0.00034622286859463484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document falcon +0.00036070626989861965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document falcon +0.00035518365036320786 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document falcon +0.00035272907057848406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document falcon +0.0003547343638218734 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document falcon +0.0003496450144966242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document falcon +0.0003537407829294287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document falcon +0.0003489722653985685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document falcon +0.00035057186899911295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document falcon +0.0003507566548933051 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document falcon +0.00035630360179023747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document falcon +0.00035631362503416367 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document falcon +0.0003490204248026821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document falcon +0.00035761724058371226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document falcon +0.00035037664777467137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document falcon +0.000353402110481068 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document falcon +0.00034524163568371745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document falcon +0.00035528523728570974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document falcon +0.00034784916132431703 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document falcon +0.00034928476408048925 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document falcon +0.00034989205973784984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document falcon +0.00034201664404094254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document falcon +0.0003529676016338611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document falcon +0.00034643433682346637 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document falcon +0.0003511666373001904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document falcon +0.00034828669066575333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document falcon +0.0003494625207264413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document falcon +0.0003458957535879216 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document falcon +0.0003543020478990003 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document falcon 
+0.00034754384069014956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document falcon
+[... one "<weight> <path> <corpus>" entry per added line, continuing through falcon-0499_text_document and then megawika-0000_text_document through megawika-0256_text_document, all under the same data_v1.7_Llama2Tokenizer directory; per-shard sampling weights fall roughly between 3.4e-04 and 3.7e-04 for the falcon shards and between 3e-06 and 6e-05 for the megawika shards, and the list continues beyond this span ...]
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document megawika +3.287625301718589e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document megawika +3.1370056372668855e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document megawika +3.186092015785841e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document megawika +7.271819324142512e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document megawika +0.001451215788905126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document open-web-math-train +0.0014486847196258788 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document open-web-math-train +0.0008861032722895899 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document open-web-math-train +0.0018119590809459816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document open-web-math-train +0.0008916937917547129 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document open-web-math-train +6.960128832809415e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document open-web-math-train +0.002008403651063623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document open-web-math-train +0.0014374900742131454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document open-web-math-train +0.00180213596996716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document open-web-math-train +0.001956178877532413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document open-web-math-train +0.0008829547017667033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document open-web-math-train +0.0008910853619157279 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document open-web-math-train +0.0018260998845299973 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document open-web-math-train +0.0012499632072059553 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document pes2o +0.00125398260359913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document pes2o +0.0012541704774729071 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document pes2o +0.0012527268234360602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document pes2o +0.0012532925243737164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document pes2o +0.0012456396241204315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document pes2o +0.0012589894424352072 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document pes2o +0.001508020123999618 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document pes2o +0.00333096950781965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document pes2o +0.0033233414614415547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document pes2o +0.003512387990689828 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document pes2o +0.0035091382940513126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document pes2o +0.003514155927147005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document pes2o +0.003327108000579638 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document pes2o +0.003329106196589836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document pes2o +0.003505604148738077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document pes2o +0.003324825759567855 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document pes2o +0.0033248240149804913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document pes2o +0.0033385962112851358 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document pes2o +0.0035043186296553615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document pes2o +0.003340469505431529 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document pes2o +0.0035106889084796276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document pes2o +0.0033309469281030167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document pes2o +0.003340337858029757 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document pes2o +0.003505919861097801 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document pes2o +0.0003882924098240512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document pes2o +0.0005759963691850877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document reddit +0.0005959971675332674 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document reddit +0.0006026179290353799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document reddit +0.0005824184320784846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document reddit +0.0005854598548616037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document reddit +0.0005903767055633473 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document reddit +0.0005930306490982049 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document reddit +0.000569425602700746 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document reddit +0.0005675060415179408 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document reddit +0.0005772431621253389 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document reddit +0.0005678026053826858 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document reddit +0.0005700398263483378 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document reddit +0.0005669467963528824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document reddit +0.0005701015953324305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document reddit +0.0005795907287413296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document reddit +0.0005735602737531164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document reddit +0.0005749862745842101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document reddit +0.0005693257015931971 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document reddit +0.0005716568794795563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document reddit +0.0005761083919774021 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document reddit +0.0005688343169797355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document reddit +0.0005807913190929842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document reddit +0.0005710229258078636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document reddit +0.0005704083039826862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document reddit +0.0005862132348308056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document reddit +0.0005717662049559556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document reddit +0.0005858155213694451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document reddit +0.0005812012281792392 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document reddit +0.0005803981414588498 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document reddit +0.0005700102108287723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document reddit +0.0005719243459052329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document reddit +0.0005867253401661752 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document reddit +0.0005731087218860733 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document reddit +0.0005712197789109317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document reddit 
+0.0005702376926310089 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document reddit +0.0005700411527742972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document reddit +0.0005828090098178196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document reddit +0.0005770140826168056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document reddit +0.0005723509664597896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document reddit +0.0005755499231836962 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document reddit +0.0005636407438471367 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document reddit +0.0005640281556500104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document reddit +0.0005633159058766496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document reddit +0.0005638034311151449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document reddit +0.0005630066273073224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document reddit +0.0005631803831128559 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document reddit +0.0005631228881679657 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document reddit +0.0005628178701487633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document reddit +0.0005624448092256196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document reddit +0.0005620957024062329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document reddit +0.0005614201504177484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document reddit +0.0005616890951464056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document reddit +0.0005611348559279058 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document reddit +0.0005604238061828518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document reddit +0.0005603301490194237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document reddit +0.0005607291294548833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document reddit +0.0005605234569930727 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document reddit +0.0005613778566640694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document reddit +0.0005610248539992471 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document reddit +0.0005599977416780475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document reddit +0.0005603632562116935 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document reddit +0.0005599177479509897 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document reddit +0.0005595202318298379 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document reddit +0.0005600975633499175 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document reddit +0.0005614075491213365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document reddit +0.000612563885043477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document reddit +0.0005515469909644413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document reddit +0.0005526782014946906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document reddit +0.0005472463408095445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document reddit +0.0005502284746004587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document reddit +0.0005414514790555363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document reddit +0.0005513499500134784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document reddit +0.0005391391454105187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document reddit +0.0005415836910001838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document reddit +0.0005208132468536551 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document reddit +0.0005889827143132871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document reddit +0.0005822520817765276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document reddit +0.0004173155230758696 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document reddit +0.0009994361338078242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document stackexchange +0.001087156194657966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document stackexchange +0.0010667737163656816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document stackexchange +0.0009602877882124873 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document stackexchange +0.0008968956271971105 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document stackexchange +0.0009198034843762967 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document stackexchange +0.0009423901016715341 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document stackexchange +0.0009674094553686345 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document stackexchange +0.0009858331322519164 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document stackexchange +0.0009970593645879198 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document stackexchange +0.0010027035193731686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document stackexchange +0.0010128291154221853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document stackexchange +0.0010215631382631918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document stackexchange +0.0010288663771461238 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document stackexchange +0.0010346219929285867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document stackexchange +0.00104544019940344 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document stackexchange +0.0010525172676724333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document stackexchange +0.0010609529620775127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document stackexchange +0.0010725892748610153 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document stackexchange +0.0010818563598181568 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document stackexchange +0.0010992760196793917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document stackexchange +0.0011178992762079917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document stackexchange +0.001124687532085676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document stackexchange +0.001118303661267191 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document stackexchange +0.0010206825575416534 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document stackexchange +0.0005512280117499715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document stackexchange +0.004474659408857016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document starcoder +0.00409944473890653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document starcoder +0.005137179939941845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document starcoder +0.005143172251066109 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document starcoder +0.005206134363352808 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document starcoder +0.004892747858974329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document starcoder +0.004844731352552902 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document 
starcoder +0.005308320169123755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document starcoder +0.005124709815666577 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document starcoder +0.005424710744483826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document starcoder +0.00538244648861977 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document starcoder +0.0029107284679086853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document starcoder +0.0026825258998444705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document starcoder +0.0026904503191419243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document starcoder +0.002687906577174073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document starcoder +0.002850165346048818 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document starcoder +0.005322698571717847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document starcoder +0.004450334290869719 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document starcoder +0.004700990083440683 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document starcoder +0.003903568556500995 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document starcoder +0.00390561515396931 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document starcoder +0.0039046402900912262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document starcoder +0.003907454839379547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document starcoder +0.0038583224578603824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document starcoder +0.0037914116657695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document starcoder +0.003786665266798682 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document starcoder +0.003792000802430658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document starcoder +0.00319266847466091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document starcoder +0.0032658716699838944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document starcoder +0.0034801959532460023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document starcoder +0.0028307012092022594 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document starcoder +0.0028420360878146276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document starcoder +0.0028410455248484914 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document starcoder +0.00283497183526842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document starcoder +0.002840187195459487 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document starcoder +0.0028398709431369834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document starcoder +0.004364722843422023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document starcoder +0.004093255713117101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document starcoder +0.004092331079566252 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document starcoder +0.004005326985579649 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document starcoder +0.0036205502856964207 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document starcoder +0.003625316793034984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document starcoder +0.003604743435602363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document starcoder +0.0035405823343673125 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document starcoder +0.0041601413517253945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document starcoder +0.005886303658937057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document starcoder +0.003600909532810332 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document starcoder +0.0034941365817168658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document starcoder +0.0004992164842980224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document starcoder +0.00032927705604725614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document tulu +0.0002860154190878753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document tulu +0.0002845217585425619 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document tulu +0.0002743528685497456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document tulu +0.00026025323737738766 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document tulu +0.00023493876414603155 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document tulu +0.00029665994994226705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document tulu +0.00031808102075993956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document tulu +0.00031813573046011285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document tulu +0.0002711905171855542 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document tulu +0.00028892513401817095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document tulu +0.00030003908676979083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document tulu +0.00026839878771944684 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document tulu +0.00029155935002690497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document tulu +0.0002998624927624209 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document tulu +0.0003091705447974841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document tulu +0.00026873195794309786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document tulu +0.00027721873498527547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document tulu +0.0002841662554024377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document tulu +0.0002839461156551537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document tulu +0.0002861705604659811 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document tulu +0.0002460995649635886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document tulu +0.00019420142619795496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document tulu +0.00021967677816173628 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document tulu +0.0002620283200480949 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document tulu +0.0002433390542188936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document tulu +0.00021254976608350767 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document tulu +0.00022094815569522115 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document tulu +0.000342862378668244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document tulu +0.00033784225259118157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document tulu +0.0003367278459543952 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document tulu +0.00029843279042852765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document tulu +0.0002926583661257988 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document tulu +0.00029320337282010673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document tulu +0.00029281450669483455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document tulu +0.0002915338187002653 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document tulu +0.0002864226923084572 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document tulu +0.00028643439083586396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document tulu +0.00028253710956299054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document tulu +0.0002810856078805806 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document tulu +0.00031474941344656715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document tulu +0.0002139130222205655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document tulu +0.0003084648871862831 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document tulu +0.0003309477872140129 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document tulu +0.0003360096824695161 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document tulu +0.0003355452655196557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document tulu +0.00038119390366386037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document tulu +0.00038078927630086064 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document tulu +0.0003386200917551554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document tulu +0.0002158905159938882 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document tulu +0.00021621682877018768 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document tulu +0.00021553306942740535 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document tulu +0.00021581563462722296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document tulu +0.0002157694110556169 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document tulu +0.000215643699847159 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document tulu +0.00021532716715168094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document tulu +0.00021531221326022472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document tulu +0.0002831801179028896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document tulu +0.0002514844936507595 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document tulu +0.00031638782778107964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document tulu +0.0002749197545278445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document tulu +0.00026159721512464495 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document tulu
+0.0002630052420096968 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document tulu
+0.00031106811228913666 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document tulu
+0.0002852973415334161 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document tulu
+3.7555372465932136e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document tulu
+0.003548077173506675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document wiki
+0.0018372203137874265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document wiki
diff --git a/ALCF/data-lists/aurora/dolma_v1_7_file_list.txt b/ALCF/data-lists/aurora/dolma_v1_7_file_list.txt
new file mode 100644
index 0000000000..2cc52b55f6
--- /dev/null
+++ b/ALCF/data-lists/aurora/dolma_v1_7_file_list.txt
@@ -0,0 +1,2419 @@
+0.0018520780893211373 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document
+0.0017591050606817512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document
+0.001459052794333798 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document
+0.0007405667281569194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document
+0.00019420030110896795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document
+0.0009008668715801845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document
+0.00015115827957143057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document
+0.0014552844319220648 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document
+0.0012469861325685161 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document
+0.00136412011372413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document
+0.0007064279699221103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document
+0.0008472240000687427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document
+0.0001984375713341955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document
+0.0005472773881697123 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document
+0.001815779629850992 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document
+0.0018313600689757324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document
+0.0002583902668716813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document
+0.0002646575141232155
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document +0.0003165521247456758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document +0.0002920706460176214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document +0.00028396813182810215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document +0.00030445161883108107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document +0.00031628781276576474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document +0.0003083776568189157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document +0.0003176359471472902 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document +0.0002536009369131698 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document +0.0003067491424681363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document +0.0002597217257557784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document +0.0003788556450109768 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document +0.0002796563272052598 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document +0.00033573826524290287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document +0.00030523658022800287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document +0.00032211552192240096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document +0.0003329295675164247 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document +0.0003101982186639862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document +0.00032361798234223355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document +0.0003495541581652915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document +0.0002821637448858042 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document +0.00030399523537629673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document +0.0002955658968247219 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document +0.00028942158502924254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document +0.00028769546171490733 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document +0.0002938111057234182 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document +0.0002711150403010948 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document +0.00031130095874747565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document +0.0003002996118160777 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document +0.0003732757901604459 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document +0.00026784205751795894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document +0.0002799626521661984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document +0.00034334276069078164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document +0.0003582469803674965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document +0.00031094844818418623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document +0.0002766228384977191 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document +0.00030297116159471485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document +0.00027033888377464685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document +0.00030090862368377933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document +0.00028543875802490955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document +0.00027559768459074204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document +0.0003182185533962886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document +0.0003311392971435837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document +0.00028751652060804325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document +0.000303466863212589 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document +0.00033400462801277524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document +0.0002589234031777426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document +0.0002913508598466723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document +0.0002670572450004856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document +0.00032027399105647656 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document +0.00032188376258379377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document +0.0003161585784100882 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document +0.0003184249182974135 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document +0.00030381336664000807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document +0.0003190437442184283 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document +0.0002537961798200545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document +0.0003017817117223326 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document +0.00028685268513240224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document +0.00031265179094451165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document +0.00034708319096986816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document +0.00026650837943080664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document +0.00034588832248507335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document +0.0002416982248399037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document +0.0003089296918222243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document +0.00029137184185700827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document +0.00026464226846800774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document +0.00030545397919456627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document +0.0003206778460448875 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document +0.00030968971641110967 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document +0.00023325653928600864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document +0.00030526899198338555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document +0.00035376719076633584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document +0.000290224385981026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document +0.000294650083382008 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document +0.00028768858128616436 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document +0.00030856965235527843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document +0.00030579942447879054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document +0.0002863101084704357 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document +0.0002870032092492213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document +0.000264182727569885 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document +0.0002974012367036449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document +0.00032238412143059203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document +0.00031683716893819036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document +0.00031157434937617524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document +0.0003411742735695989 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document +0.00026778444816570715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document +0.0003037045797275201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document +0.00027746114370081314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document +0.00027148285946862043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document +0.00028042950114678207 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document +0.0003235607816590721 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document +0.0003086692227306295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document +0.00033990349455148105 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document +0.00030945053208470265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document +0.00027309074552265303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document +0.00028737393506316194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document +0.0003098868328009879 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document +0.0002614229162588409 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document +0.0002884388407820923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document +0.0031025147279277244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document +0.003102019887362634 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document +0.0009996745994661548 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document +0.0002406272620255565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document +0.0002404825539493424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document +0.00024062296575435581 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document +0.00024069315766818953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document +0.00024055829162263452 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document +0.00024062053397343032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document +0.0002410715545206964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document +0.00024024881846087368 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document +0.0002407074700790688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document +0.00024072141428809043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document +0.00024027710230872736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document 
+0.0002409111299205489 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document +0.00024081954058275009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document +0.00024086076794990912 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document +0.00024098672620832446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document +0.00024068622303333862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document +0.00024140627024291824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document +0.0002414512033594384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document +0.00024028742594941463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document +0.00024018036089269645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document +0.0002398347365034979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document +0.00024006780153485276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document +0.00024015620270419213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document +0.0002408848259695227 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document +0.0002408023185278831 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document +0.00024021196580140326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document +0.00024077677271297493 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document +0.00024087392454668027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document +0.0002408071293824126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document +0.00024042223828845715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document +0.0002411484752360495 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document +0.00023605263746465907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document +0.00023471222158326908 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document +0.00023432138580287644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document +0.00023407385623382327 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document +0.00023487504174367091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document +0.0002341843704976313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document +0.00023421993170282486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document +0.00023445057969132037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document +0.0002337681680073047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document +0.000234627964808109 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document +0.0002338942211888584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document +0.00023403849286843386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document +0.00023405641310796305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document +0.00023349169562397965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document +0.00023381157386048856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document +0.00023388742993790587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document +0.00023363103829469813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document +0.00023421141834630477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document +0.00023420564352232565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document +0.00023367463699173143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document +0.00023344969163567033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document +0.00023372196941547188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document +0.00023399207645297834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document +0.00023357915605505856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document +0.00023337585642190864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document +0.00023385005470157914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document +0.00023301533534493465 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document +0.00023377864302541782 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document +0.00023323745848621437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document +0.0002330594611151835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document +0.0002334149675026783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document +0.00023198945902291534 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document +0.00023023784834634142 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document +0.00022985623060187217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document +0.0002292605284569516 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document +0.00022926593333048894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document +0.00022922766406807777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document +0.00022898153911167426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document +0.0002292473111593315 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document +0.000228804579400424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document +0.00022865485613513526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document +0.00022937426835887895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document +0.00022917388311587372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document +0.0002291660582019043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document +0.00022907895248360543 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document +0.0002294617879920205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document +0.0002290452150516566 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document +0.00022943405619715553 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document +0.0002296271421006204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document +0.00022854791372910372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document +0.00022923123467686557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document +0.00022852404355738494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document +0.00022847798660086642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document +0.0002289604586810316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document +0.00022835479834950643 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document +0.0002289149402884243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document +0.00022806655474763446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document +0.00022826296420992974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document +0.00022906829636213627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document +0.0002287628414466998 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document +0.0002282673911253445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document +0.00022869309841939134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document +0.0002281540116815451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document +0.0002259755756162738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document +0.00022562331285233504 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document +0.0002259061146106053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document +0.00022567670836663787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document +0.00022573165387587061 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document +0.00022508514961670572 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document +0.00022564642513773356 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document +0.00022563088621998788 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document +0.0002250438755373707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document +0.00022524465346241134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document +0.00022531737657666812 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document +0.00022444687519363458 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document +0.00022460397498596298 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document +0.00022454218976501763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document +0.00022447528843671366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document +0.00022501666332178926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document +0.00022453752304377972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document +0.00022484451871163002 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document +0.00022465678847154914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document +0.00022453180917044732 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document +0.0002247278486823009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document +0.00022465794828242097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document +0.00022431000701925386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document +0.00022476020248460963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document +0.00022467531771795015 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document +0.0002236391309945234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document +0.00022458764920536007 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document +0.00022430877426744415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document +0.0002247047786127192 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document +0.0002245298090400035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document +0.0002245648831396188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document +0.00022292894729820784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document +0.00022236668082957533 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document +0.0002217622659895442 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document +0.00022252452726732609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document +0.00022135333211363678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document +0.0002214571757787971 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document +0.0002217188139237798 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document +0.00022144214894640303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document +0.00022100172806631854 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document +0.00022156392409199052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document +0.00022134830143710272 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document +0.00022158598922529453 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document +0.00022142932483041377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document +0.00022120980907786554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document +0.00022117917738112441 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document +0.00022077089397851235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document +0.00022093265074996711 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document +0.00022091299741377004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document +0.0002205849150703338 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document +0.0002210648204787979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document +0.0002214235747364102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document +0.00022083907302221787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document +0.0002206334237915964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document +0.00022065193929912214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document +0.00022079775597767288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document +0.00022091492909963518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document +0.00022095009987097293 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document +0.0002208150577180165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document +0.00022085759102772088 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document +0.00022073789170129016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document +0.00022049322781182384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document +0.00022083270617761285 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document +0.00021982452827473632 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document +0.00021899870446514259 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document +0.00021890358773356361 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document +0.00021875556609042841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document +0.00021861195987201226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document +0.00021856782186167455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document +0.00021912837771543515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document +0.00021900213768517756 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document +0.00021871675851390374 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document +0.0002180537056545586 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document +0.0002188196714327129 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document +0.00021851362624523464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document +0.0002183236795498736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document +7.291153618675672e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document +0.0003742481815405742 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document +0.00038204855962733055 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document +0.00038821818392663593 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document +0.00038723332988783727 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document +0.00038916141142149904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document +0.00038049542523949033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document +0.0003854755539534284 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document +0.00024202756466512517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document +0.0003915405155008087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document +0.0003927382151931033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document +0.0003839151202260479 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document +0.00040006817468967907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document +0.00040318965964443476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document +0.0003831013019452741 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document +0.00039166638383204036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document +0.00039962784023961004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document +0.00039536707853602614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document +0.0004204304698247758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document +0.00041538899178693555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document +0.00039186953333675306 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document +0.00038945837196504305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document +0.0003919951238929062 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document +0.00044377065718528966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document +0.0004407759068603017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document +0.0002487811895843715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document +0.00039349432045556636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document +0.00041223198559462343 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document +0.0004036573014830213 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document +0.0003825982215521807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document +0.00040386867133151386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document +0.00024460575279105167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document +0.000269029789531335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document +0.0003573757493252864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document +0.0004600876681392076 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document +0.0002605354166397086 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document +0.0003882502452157999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document +0.0002466747612126512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document +0.0004024726105072402 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document +0.00040820631128483644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document +0.0002691094350403538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document +0.00026916830387277267 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document +0.0004204663297880574 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document +0.00042379698687085554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document +0.0004502169227311871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document +0.0002661708937015295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document +0.00031239486948031334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document +0.0003109054589936201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document +0.00045873053079760646 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document +0.00022904931423244635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document +0.0003813462028433663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document +0.00039188129256500874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document +0.00045124222276983765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document +0.00048138658436853695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document +0.0003944178776279866 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document +0.00039941569676754006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document +0.00037952761190240494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document +0.0003944870860881476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document +0.0003891842411856621 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document +0.000387688981934861 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document +0.00039197953876258005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document +0.00039007915280311206 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document +0.0003995520363699188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document +0.00039230985654592406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document +0.0003929472067173851 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document +0.0003924096172671473 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document +0.0003881636143629905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document +0.000389790617937084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document +0.00037351762309221023 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document +0.0003630196170929407 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document +0.00033532465765142113 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document +0.0003076088685761823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document +0.00039463850897720803 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document +0.0002843816115231449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document +0.0002909175709416474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document +0.00028867170997202486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document +0.0002838644617723659 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document +0.00029027869525543416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document +0.0002821339567560056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document +0.0002922988877045601 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document +0.0002866955958315786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document +0.0002865271754558126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document +0.0002861247475618473 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document +0.0002826681072408606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document +0.0002849746458282827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document +0.0002816966633435316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document +0.00026255342235948463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document +0.0002552895098829678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document +0.00025990194083107813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document +0.0002524062657685835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document +0.0002538577379748611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document +0.0002561415177406761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document +0.00026206253059694905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document +0.00026168095406910565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document +0.0002601305742008613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document +0.00025200823006814814 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document +0.0003229951981263502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document +0.00037289448266476045 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document +0.0003807825862179898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document +0.0003616333738191483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document +0.0003665117918907636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document +0.0003684186453633228 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document +0.0003589330610806066 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document +0.00036383861418030395 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document +0.000359841363355303 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document +0.00036431044063050464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document +0.0003668574090358279 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document +0.000362768263620199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document +0.0003501888032771077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document +0.000352401968221528 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document +0.0003541019701869794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document +0.0003628121865546891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document +0.0003752582953758773 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document +0.00037902046230424966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document +0.0003777927146925147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document +0.0003760676130509053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document +0.00034046049078755405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document +0.0003338847563259091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document +0.00033294499102761794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document +0.0004912026198265864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document +0.00032064363474664014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document +0.00032154190389541214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document +0.00032309660151746207 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document +0.00031181143365304544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document +0.00031046092294569104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document +0.00031150165249068046 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document +0.0003041314265988224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document +0.0003024834909739394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document +0.0003019936835833604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document +0.000292329665283177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document +0.0002867061143144972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document +0.00028443615610701707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document +0.00028462291013755945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document +0.0002793538601205013 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document +0.00027306573977044246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document +0.00027097155673336525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document +0.0002752934202112985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document +0.00043042012694697647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document +0.00047495648822986177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document +0.00047755032493473855 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document +0.0004706974343933747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document +0.00046682163297771817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document +0.0004616765425874178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document +0.00030644496751628097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document +0.0002909492555358308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document +0.00027272036068261724 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document +0.0004101070217315588 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document +0.0003728914338834357 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document +0.00036546911442305647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document +0.0003669945482407483 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document +0.0003715902407424017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document +0.00035837486406683366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document +0.0003573318538685469 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document +0.0003553784893071916 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document +0.0004920659809912352 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document +0.0004533619411303183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document +0.00045067066057818706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document +0.00044396985139270645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document +0.00043198288204468477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document +0.00043005174223738454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document +0.00041847118430776784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document +0.00042952036375796664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document +0.00043420594647324267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document +0.0003461123241053012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document +0.0003408581597849182 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document +0.00033172705422182547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document +0.0003392566490686136 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document +0.00033578341518385483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document +0.0003439196710518844 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document +0.00034559163447085543 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document +0.00033762478642902825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document +0.00033215210055107224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document +0.00033423579608014966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document +0.0004963355016025102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document +0.0004996862761456923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document +0.0005000551829325451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document +0.0005004212610098755 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document +0.00027768695585500585 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document +0.00028395983854338433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document +0.00027835826303062254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document +0.0002740073176010804 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document +0.0002791830529274016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document +0.0002796863816194411 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document +0.00026697453022672804 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document +0.0002594197440280141 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document +0.0003779565697649222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document +0.00041835823476586606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document +0.00043788493575265915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document +0.0002731731970096006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document +0.000276305847423402 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document +0.0002704955773958623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document +0.0002629635944827518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document +0.000260070956974436 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document +0.00025661553791456334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document +0.00025794727207576157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document +0.00025295733980001527 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document +0.0003788106407021029 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document +0.0004882344027669431 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document +0.0003275324309642705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document +0.0004803401856640094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document +0.00046720138323433943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document +0.00043527810307095335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document +0.00043905395741627827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document +0.00048774175867331425 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document +0.00048380704121346737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document +0.0004779011848346118 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document +0.00046255587581908036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document +0.00045127922880511576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document +0.0004503891485256095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document +0.0004450142332303422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document +0.00044630282482516654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document +0.00044325014465743616 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document +0.0004263874842796447 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document +0.0004217530913646938 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document +0.000415120314341852 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document +0.00040987168279144537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document +0.00033468337266607834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document +0.0003353094464683005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document +0.0004833936821707294 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document +0.00047194878988920935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document +0.0004648324126996427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document +0.0004562345003964941 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document +0.0004933203505465098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document +0.0003530166075325466 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document +0.00035368548192804685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document +0.0004872620828289663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document +0.00048293889392426456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document +0.00047936768462267655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document +0.00047821013991587545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document +0.0004660610308564753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document +0.000394683430103437 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document +0.00039165053441571324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document +0.0003906936040164381 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document +0.00038074803919159006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document +0.0003686529291578143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document +0.00035832920428870976 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document +0.00035929024535947033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document +0.0003538226556050544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document +0.0003584167868708799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document +0.0003480507542594234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document +0.0003413709023543034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document +0.00034001304759361455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document +0.00033430532902756514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document +0.00046519252660631277 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document +0.0002938876402514769 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document +0.00028676090994509047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document +0.00027296150117506716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document +0.00026513502621960483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document +0.0002680081327926125 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document +0.00025831225828720344 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document +0.00026647037295561 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document +0.0002525733734572654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document +0.00025831708887575375 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document +0.00042487627444443476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document +0.0004951213245023891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document +0.0004804051413177752 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document +0.0004662397611340532 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document +0.0004550138655253933 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document +0.00044494909122746795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document +0.0002899112253051385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document +0.0004372879736279761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document +0.0004529568099252922 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document +0.00045127826158829573 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document +0.0004436558176737439 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document +0.0004419233237678378 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document +0.000434589215880319 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document +0.00029153613207706566 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document +0.0004312458058738854 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document +0.00028741854968757313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document +0.00046853200754421234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document +0.0004949145252030074 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document +0.00044459683920483167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document +0.0003836095306696336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document +0.0003789760237872398 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document +0.0003749227438304427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document +0.0003628558277173369 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document +0.00039468301394041474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document +0.00038874701821614864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document +0.0004158492456077867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document +0.00042360504554060077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document +0.00040386729844317623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document +0.00027595096702902474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document +0.00043638766787829135 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document +0.0002218691596850179 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document +0.0004437566108089954 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document +0.0003889996411609667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document +0.00043454421906537704 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document +0.0004522564392830988 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document +0.00041517835659357416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document +0.0002614360863446896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document +0.00037543522111463596 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document +0.0004386190133514781 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document +0.00046358333286115075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document +0.00043186261317942404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document +0.0002377581602097957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document +0.00025973334085074254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document +0.00040139099332000796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document +0.00043674860686687174 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document +0.00040853289309329373 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document +0.000242910191729688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document +0.0004431071731750582 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document +0.0004388092670482523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document +0.000381418866255965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document +0.0004100117296419717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document +0.00042469230366022745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document +0.00041744151905374254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document +0.00022835699906752945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document +0.0004380161085387397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document +0.00044803212381807456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document +0.00040554932796137236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document +0.0004234508646347761 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document +0.00043341209652360653 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document +0.00023966604734537185 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document +0.000259165907316014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document +0.0004270653021833602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document +0.0004341547032162028 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document +0.0004111478117275994 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document +0.0004299383567984396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document +0.0004241899124590779 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document +0.0004502719349364145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document +0.00038994621469645615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document +0.0003859912398894952 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document +0.0004247535950310557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document +0.000386982084327716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document +0.0004196451040053251 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document +0.0004096278509782259 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document +0.0004373334932695721 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document +0.0004180889975240641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document +0.00042079636929672745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document +0.00038063574611812913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document +0.0003817505891515542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document +0.0004420096268860222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document +0.00039182670726410623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document +0.0003635667850372299 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document +0.00041564996472055667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document +0.000400529358757286 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document +0.0003939113874958451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document +0.00039066622068940996 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document +0.0004290098538807143 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document +0.0004240739958197099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document +0.00040775392659215333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document +0.0004091634200396925 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document +0.00042299190476617914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document +0.0003701492680344151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document +0.0003807353844384635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document +0.00038813507771983156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document +0.00040072346558408346 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document +0.0003603595180423597 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document +0.00038799421353112465 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document +0.00037575235582264926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document +0.0004239190342959713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document +0.0004606044799136546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document +0.00045107950652529253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document +0.0004391947201871058 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document +0.0004457516661123035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document +0.0004301297170991686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document +0.00044661704164586694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document +0.0004438849846114837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document +0.0004444205734316823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document +0.0004190924165303394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document +0.00043942581131677875 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document +0.00021568459798090663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document +0.0003814929225407199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document +0.0003217453179359235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document +0.00031719591470267974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document +0.00032434115726922137 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document +0.0004079911120371051 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document +0.000329492766381148 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document +0.0003845916162001633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document +0.0003835208964390098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document +0.00037847334157173194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document +0.00038296039903791865 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document +0.00037896336828472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document +0.00037620974396391355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document +0.00037420590727111843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document +0.000340490625886403 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document +0.0003078314411035827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document +0.00034153990750656097 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document +0.0003308858103982067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document +0.0003452640607156025 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document +0.00033095276418403455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document +0.0003116308995860414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document +0.00032446713226408477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document +0.0003015816821912984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document +0.00031612418775706894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document +0.0003278516344971041 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document +0.00033079446736097217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document +0.00032278977146550837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document +0.00032065272988207914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document +0.0003936696452406576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document +0.0003450109536627789 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document +0.0003339787189919641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document +0.0003284303856176974 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document +0.00033652677276843477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document +0.0003257822443845694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document +0.0003293985569149334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document +0.0003310360260148262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document +0.0003233770986418526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document +0.0003172280092149422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document +0.0003160674744292835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document +0.00030931090289598506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document +0.0003093173886443107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document +0.00033167847081104083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document +0.00031131501311729723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document +0.00031046608876279845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document +0.00030569235942207244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document +0.00030777943671285197 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document +0.00029303314290956683 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document +0.0003045824546400205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document +0.00030360880677729793 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document +0.00031646239964835433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document +0.0003129122300603785 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document +0.00031060464956661433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document +0.000311819032500067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document +0.0002977872483902282 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document +0.0003009448600922438 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document +0.00028610292098537774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document +0.0002988326876216654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document +0.00028550828372819075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document +0.0002830381750875739 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document +0.0002848495855927156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document +0.0002856443760308144 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document +0.00027442895344188584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document +0.0002681160554049462 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document +0.0003421482544126989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document +0.0004005872948449718 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document +0.0003930123959320308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document +0.0003867271832275778 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document +0.000380805140455254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document +0.0003814769861947819 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document +0.00038025170883282324 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document +0.0003738026647867475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document +0.00018960856915036276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document +0.0003697177501953134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document +0.00036674194328136693 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document +0.00036447406838697555 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document +0.00036686410861101255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document +0.00035915267825103423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document +0.0003624758404026675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document +0.0002822812140180794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document +0.00030620512946920813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document +0.000294249776520589 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document +0.00030238536967523434 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document +0.00029509593361580754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document +0.0002906912701830899 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document +0.0002921944165474959 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document +0.00028358919691127954 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document +0.0002813182772323272 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document +0.00027442640800299205 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document +0.0002747820342933984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document +0.0002747584403979717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document +0.00027499129634862444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document +0.0002712050404257197 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document +0.0002616256943143254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document +0.00026769938929002815 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document +0.00038396081322727017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document +0.0003863140490027991 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document +0.00037702277513203237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document +0.0003633274156107032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document +0.0003587473889240435 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document +0.0003507672084278415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document +0.00033776425499780385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document +0.0003377914127574796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document +0.00032948015659161326 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document +0.00033245638541392985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document +0.00031080707640648695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document +0.0002976903331149755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document +0.0002965121463725523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document +0.0002933849695266647 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document +0.0002837035078508233 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document +0.00028684569079589323 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document +0.0003145192320802359 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document +0.0003566937253273515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document +0.0003470199109592918 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document +0.0003060245312041868 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document +0.0002650817213818789 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document +0.0002643604938780134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document +0.000299350876031416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document +0.0003178540797697938 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document +0.000271850367887767 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document +0.00031349896596549 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document +0.00031749734412765755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document +0.0003791137842391209 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document +0.0003742334169957992 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document +0.0003705639757351107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document +0.0003126986769797042 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document +0.00031038132814561196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document +0.00036464437173804883 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document +0.0003569480488951322 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document +0.0003541239221619106 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document +0.00035315297411308053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document +0.0003572451925404141 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document +0.0003514986129411253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document +0.0003521798298425866 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document +0.00034553677439244716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document +0.000349004719809412 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document +0.0003468247484872769 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document +0.0003465822608356558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document +0.00035410983132162007 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document +0.0003487908354969444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document +0.0003479024763238147 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document +0.000341412530646823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document +0.00034451316273667034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document +0.0002618849993484869 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document +0.00026788679978901144 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document +0.00027450670773227214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document +0.0002661273129899329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document +0.00026836569676402957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document +0.00026155876975483236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document +0.0002609276830117151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document +0.0002644161630512771 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document +0.00036789208972872557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document +0.00037829849439990513 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document +0.0003788894943523098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document +0.0003617207777959397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document +0.0002541334487248998 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document +0.0002707945538071073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document +0.00027046282716455214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document +0.0002652443167243215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document +0.0002685859923850986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document +0.00025734961751176414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document +0.000259041720872915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document +0.00025340107274823446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document +0.00025757135121837893 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document +0.00025617700500574084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document +0.0002566931670562857 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document +0.0002543871190716101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document +0.00024997565589481713 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document +0.0002954079779456287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document +0.00034890741135252835 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document +0.0003473298137731525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document +0.0003296959618486435 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document +0.0003304520061604598 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document +0.00032377956175729824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document +0.00031700696295168713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document +0.0003060382346081943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document +0.0003012003005056863 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document +0.0002981074073993884 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document +0.0002922128825950705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document +0.000348901087722931 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document +0.0003408286289467841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document +0.0003410649680770183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document +0.0003358524215576502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document +0.0003343661874989231 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document +0.00032810573699389156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document +0.00032261449539097497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document +0.0003162694866049203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document +0.0003158381156468853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document +0.000317376061083603 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document +0.0003125788639953052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document +0.0003010105041885602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document +0.0003065865059090678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document +0.0003084275726508053 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document +0.00030966560718296085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document +0.0002957728057853081 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document +0.00029904164542325336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document +0.0002955358888729187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document +0.00028692976446931544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document +0.0002923476214935797 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document +0.0002893691697212419 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document +0.0002855895211981585 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document +0.00027968347097626246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document +0.0002810783462604979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document +0.00027794080455729715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document +0.00034784376461416953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document +0.0003488347959010943 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document +0.00034790583710250724 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document +0.000345913166618151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document +0.00033801936268066675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document +0.0003290591130212315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document +0.00034051399521366823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document +0.00032470943131841784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document +0.00031679540050914276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document +0.00031814596342422325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document +0.0003156466289485036 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document +0.00029985010879003633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document +0.0002905176377776361 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document +0.0004206836775460856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document +0.00020660449162246918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document +0.0003461727254468087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document +0.00020592870907067763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document +0.00034173505299233005 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document +0.0004052437256652738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document +0.0004080650901351697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document +0.00039778184149144276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document +0.00039046311464950275 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document +0.00039043444911071384 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document +0.000388575704932843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document +0.00019737533145666597 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document +0.00037610755595812403 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document +0.00037315400127598317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document +0.00037415028580922163 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document +0.00036694041707212337 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document +0.00018947219857306515 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document +0.00037046050826533545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document +0.0003587440768559087 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document +0.00034623936498708903 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document +0.0003502289592617922 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document +0.00034692398063649823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document +0.000339340809421849 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document +0.0003360510394816983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document +0.0003354673850814145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document +0.00032937682875877047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document +0.00032844505049317715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document +0.00028287199339908627 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document +0.0002795217197003578 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document +0.00028048955601883463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document +0.0002769326396439027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document +0.0002727090021299243 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document +0.0002726577841024554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document +0.00026663619593455374 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document +0.00026068042672138127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document +0.0002637704114326801 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document +0.0002593043567100412 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document +0.0002599897110113453 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document +0.0002435078682758859 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document +0.0002450530071379054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document +0.00024233331983743606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document +0.0002934750947999535 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document +0.00033241226364044474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document +0.00032938406090272075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document +0.00032778705403953246 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document +0.00032184551480398754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document +0.00031874002264945737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document +0.0003165319685666433 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document +0.00031307071173376295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document +0.00031119524184911957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document +0.0003102253344576429 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document +0.0003088976240383192 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document +0.0002951410823077708 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document +0.00029772657676757413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document +0.0003056048989909935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document +0.00031991305381648026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document +0.00030890256978362426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document +0.0003109382904091933 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document +0.00031035798529690644 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document +0.00030741666395911753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document +0.0002989918594861846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document +0.00029569635443989434 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document +0.0002973992445667285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document +0.000293397351001072 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document +0.00028737817438047954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document +0.00028252738144009747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document +0.0002805511898623541 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document +0.0003718020784620472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document +0.0003499713845765235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document +0.00034283547445326676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document +0.00031464759888838765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document +0.00033188946446414833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document +0.000326084432195463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document +0.0003764568303917893 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document +0.0003604955598858414 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document +0.0003655654554133222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document +0.00035762304033750504 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document +0.00038478883950347103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document +0.00027735714341247454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document +0.00028139534607773563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document +0.00019777292251713763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document +0.000285571704874486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document +0.00028543482146244363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document +0.00019434234484256758 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document +0.00027854908176986763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document +0.0002847068039566143 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document +0.00028672356943064853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document +0.00027782687605808177 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document +0.0002843539634105203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document +0.0002894748379090401 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document +0.0002868852440186493 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document +0.0002818504885373851 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document +0.00028680112812941034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document +0.00019258978168723977 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document +0.00028760637934715155 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document +0.0002820439443912918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document +0.0002831001054410018 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document +0.00029001901552467397 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document +0.00027779449377883156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document +0.00019949837437516796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document +0.0002907306472984446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document +0.00027814858381318327 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document +0.00019472790889161432 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document +0.00020472626596924125 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document +0.0002870045081974301 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document +0.00019812241927078482 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document +0.0002817553333369554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document +0.00027829782796642117 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document +0.00028289431732284113 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document +0.0002795526296717729 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document +0.00027682829988044574 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document +0.0002895432402719184 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document +0.0002823174903941811 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document +0.00028170972351837796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document +0.00027807915877838826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document +0.00028588515681452956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document +0.00028112324090816726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document +0.00020636178289985485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document +0.00019447255290980535 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document +0.0002850824220591452 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document +0.00027856429520116784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document +0.0002820880676635633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document +0.00028943902215995714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document +0.0002676366291085329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document +0.00023806333809954687 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document +0.00024526460430233455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document +0.00023876876664622726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document +0.00023379770334179805 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document +0.00024175151269138382 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document +0.00023386583242595706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document +0.00023771797150160827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document +0.0002262748967483896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document +0.0002408148346432682 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document +0.00023398651720444235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document +0.00022989433874474592 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document +0.00023948500543957772 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document +0.0002331594076859196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document +0.00023375132439600242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document +0.00023923410909668642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document +0.00023952796315562954 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document +0.0002327466076905069 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document +0.00023082758956797212 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document +0.0002240509275524448 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document +0.00022798879995765268 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document +0.000221172516774386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document +0.00021767045123534623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document +0.00021982832794804484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document +0.00021971626543789102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document +0.00022566565206920132 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document +0.0002181984894194856 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document +0.00021831417549554653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document +0.00021601405421187145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document +0.00022275733725519607 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document +0.00021847734911973986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document +0.0002243591012664014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document +0.00021688758139483833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document +0.0002182953624789215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document +0.00020475155724026002 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document +0.00021498078062960065 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document +0.0002157914337233064 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document +0.00021781838494967963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document +0.00021723242266814558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document +0.0002176782686553837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document +0.0003486179404943968 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document +0.00034882846352857634 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document +0.00031400868448352596 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document +0.00030273484020011963 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document +0.00029895889118145404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document +0.00029770764609621714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document +0.0002990181332116852 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document +0.00029653733972285996 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document +0.00029624649222942476 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document +0.00029625609720203576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document +0.00029731928930852147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document +0.00029011721326148513 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document +0.00028849788197494655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document +0.00021601278623858145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document +0.00021319599281739178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document +0.0002153325290600083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document +0.00018566946174516558 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document +0.00020736824394291617 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document +0.00020857419820128004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document +0.00020058526129536423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document +0.00020745812166665217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document +0.00020652171015271702 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document +0.00020643808911278608 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document +0.00020040513914482103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document +0.00020598050188272898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document +0.0001969184139343296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document +0.0001972748812937012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document +0.0002038556751586195 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document +0.00020245186011313464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document +0.00019950381422038783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document +0.00020837055459665258 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document +0.00020371856218246096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document +0.00019537612301625791 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document +0.00019914984508813857 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document +0.0002053787713691309 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document +0.00019082100541008637 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document +0.00020397153334531813 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document +0.0002021462693077317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document +0.00019609357008124035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document +0.00019693256622486236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document +0.00020007239732428112 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document +0.00020467075741591954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document +0.00019584883400022932 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document +0.00019135050391176972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document +0.0003362829834208298 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document +0.00034013691154784095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document +0.00033215887031941976 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document +0.00032681189065396707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document +0.0003149138485493094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document +0.00030179177307540077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document +0.0002923278437581119 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document +0.00029470052278994486 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document +0.0002994095093045731 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document +0.00029033525096085037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document +0.00029390798852496565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document +0.0002916230924130842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document +0.00029419886374594913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document +0.0002865469756730764 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document +0.00021191292549942086 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document +0.00021369664817409847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document +0.00021612485624266726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document +0.00022242192634588478 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document +0.00014605095659989698 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document +0.00022070626106341693 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document +0.0002174420774054071 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document +0.00021325858963116995 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document +0.0002124322999488052 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document +0.0002081218896969054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document +0.0002108710211556957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document +0.00020686867095978426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document +0.00020895752681041895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document +0.00020741922266415738 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document +0.0002069112657197308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document +0.00020644627473468118 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document +0.00020332991338121604 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document +0.0003560895677789848 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document +0.00032915779111908214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document +0.00033810613317040864 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document +0.00033729626594036923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document +0.00033550342864602944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document +0.00034173474024556906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document +0.000331505340748827 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document +0.0003270050330117195 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document +0.00032585275329172556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document +0.0003143383203190604 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document +0.00031655199110388894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document +0.00030738872158476413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document +0.00030838388352699285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document +0.0003053596995351888 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document +0.00031836304739584593 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document +0.000315315435873905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document +0.0003087116248965243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document +0.00030396790625537645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document +0.0003335812246032149 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document +0.00034570956323095843 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document +0.00034563035636675786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document +0.00033411265479076335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document +0.00034439191141692787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document +0.0003364483125496565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document +0.0003299500453608033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document +0.00033163377700074837 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document +0.00032638649660627673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document +0.00032616167939645234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document +0.0003205289298760723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document +0.00031939393740815355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document +0.00031593164066731296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document +0.00031928871111254405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document +0.00029670189073175004 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document +0.00020517703846735904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document +0.00020128418186172073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document +0.00019662723895606717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document +0.0001981157042081407 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document +0.00019703489037041608 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document +0.00019079796331785068 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document +0.0001909352306690079 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document +0.00018824662295261396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document +0.00019864275319325954 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document +0.00018818516521649587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document +0.00018875694972812844 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document +0.00018231621170645482 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document +0.00018349407845798273 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document +0.00018088971427746906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document +0.00018296284236327237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document +0.0001876011825819916 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document +0.000329052068725176 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document +0.00032223616273648536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document +0.00031272564089633955 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document +0.00031621609908414494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document +0.0003117213560911235 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document +0.00030218064069945934 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document +0.00030658916600512085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document +0.0002915863534115821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document +0.0002940280138374372 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document +0.00029067860468866085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document +0.00028529228063135635 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document +0.00028336893301452256 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document +0.0002794668089130099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document +0.00021681361378827842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document +0.0001484664674497246 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document +0.00021950558378215133 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document +0.00021806860758808645 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document +0.00021819568718852282 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document +0.00021626925931585001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document +0.0001464536143077762 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document +0.00021432777088808917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document +0.000213473805865147 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document +0.00021397067253964538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document +0.00020758957647437263 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document +0.00020687124337683314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document +0.00020630057046511005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document +0.0002091166859352538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document +0.00020777355025615267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document +0.00020709287641496176 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document +0.00020736464660577094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document +0.00020062246741862607 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document +0.00020693207561942915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document +0.00021151004871893024 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document +0.00019930249098689716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document +0.00021589710041231824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document +0.00021369204789905741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document +0.0002147099923936778 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document +0.00021077531190389536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document +0.0002100509829113836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document +0.00021185362601571124 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document +0.00020722136637339565 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document +0.00020300093701169531 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document +0.00019859737993313477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document +0.00019971314372100164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document +0.00019549908270269278 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document +0.00019649820843534028 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document +0.00019619415513498067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document +0.00019493006120377898 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document +0.00019499409035775506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document +0.00019252988593634277 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document +0.00019440768268686405 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document +0.00018747161324755577 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document +0.0001879575932372779 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document +0.00019040707058357506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document +0.0001871931095090703 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document +0.00020112966223017096 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document +0.00020516878165311017 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document +0.00020664735191740533 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document +0.00021041398572882962 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document +0.00020397992929690396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document +0.0002039978580295561 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document +0.00020592785601142126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document +0.0001990755527445265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document +0.00019729564847798732 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document +0.00019958182230527032 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document +0.0001985037302636386 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document +0.00020204130355115716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document +0.0002000296401958085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document +0.0001983064832295463 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document +0.00019663108484195617 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document +0.00019510678560556523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document +0.0001873284057063206 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document +0.00019311553072495885 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document +0.00034652137288816547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document +0.0002813690318850024 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document +0.00027697649713138685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document +0.0002755419092534421 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document +0.0002681583054440219 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document +0.00026945753192750824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document +0.00026169470768245737 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document +0.00026437008960810825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document +0.0002637294838228 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document +0.00026491867965088836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document +0.00025504483625138986 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document +0.0002545040623796586 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document +0.0002546682814073622 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document +0.00025545439487142615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document +0.0002626896557978271 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document +0.00025092040940402784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document +0.0002589154885863872 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document +0.00024106160482721467 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document +0.0002483289690087987 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document +0.0002388930282784437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document +0.00024006340759273874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document +0.00023765248178029045 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document +0.00023061351965578936 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document +0.00024954224883546477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document +0.00017861017233018525 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document +0.00017810832743667658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document +0.00017599709170759497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document +0.00017462723516505223 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document +0.0002906316527068669 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document +0.00033762141066247166 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document +0.00017170670574152494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document +0.00017258674515137717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document +0.0002815386173173926 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document +0.0002996845935618989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document +0.0002735268488987296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document +0.0002971738713071517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document +0.0002942690674002763 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document +0.0003322222207729567 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document +0.0003378721656198464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document +0.00018307262621851067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document +0.00033956081502775057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document +0.00031604820927876276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document +0.00028805657681088917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document +0.00026312293321215633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document +0.00034366936722921455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document +0.0002865256504406559 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document +0.0003063615195861786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document +0.00028412791619666136 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document +0.00028060835132727154 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document +0.00032544974761560506 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document +0.0002647177833217225 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document +0.0003152621884896575 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document +0.0003054625140336913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document +0.00031183308312292263 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document +0.00018175026696621178 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document +0.00017699918328872 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document +0.00018222339261441908 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document +0.00018348005930964137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document +0.0001810735993810541 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document +0.00030846441282038914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document +0.0002972326889310354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document +0.00017433421318235594 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document +0.00032799458649525895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document +0.00032482130048512673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document +0.00031943465668672475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document +0.00029615593630484517 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document +0.0002893126939511001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document +0.0002849288351723284 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document +0.00028383906633569267 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document +0.00028072526091262615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document +0.000284239564292377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document +0.0002778903109432523 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document +0.0002771644389501471 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document +0.0002733316182319337 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document +0.00026362539185869363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document +0.0002636325383220217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document +0.00026740622442302886 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document +0.0002646771971853427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document +0.0002628566720605389 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document +0.0002644760695434766 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document +0.0002623837702310999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document +0.00026088722976772894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document +0.0002567065374799158 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document +0.00018857382101207726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document +0.00019036580399817203 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document +0.00018348828065261222 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document +0.00018491851780345073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document +0.00018904887260080187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document +0.0001875609304251801 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document +0.00018393034720015817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document +0.00018419795526114903 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document +0.00018699955623404795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document +0.00018276256902965128 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document +0.00017698045695190812 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document +0.00018104650132303642 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document +0.00017758206731279688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document +0.00017131402995103497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document +0.000175944428350446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document +0.0003416745727147391 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document +0.0003163259373952889 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document +0.0002804489269172448 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document +0.00028748272397403175 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document +0.00027603318345630605 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document +0.000271638824679648 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document +0.0002763761210210942 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document +0.00026501984873172717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document +0.00026422486894694714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document +0.0002686339100849262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document +0.0002610837453940606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document +0.000260974343729353 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document +0.0002599403837029134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document +0.0002937273113238609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document +0.0003341790732600504 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document +0.0002620661576600244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document +0.0003027929169239288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document +0.00031944039129326894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document +0.00019025676304139009 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document +0.00018680910145009907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document +0.00034215840419416437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document +0.00018618120812119364 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document +0.00018605853095599425 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document +0.00018120712626096538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document +0.00018315079292495327 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document +0.00018362556449041974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document +0.0001780024456718171 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document +0.00033296526436178697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document +0.0001802398632282846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document +0.00017340263100798256 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document +0.00017755840547238697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document +0.00018419413735260606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document +0.00017869518174591322 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document +0.00017526271460129484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document +0.00017852168597981907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document +0.00017566536156787157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document +0.00017589867964432936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document +0.00017831487394075305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document +0.00017837310528935862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document +0.00018200908814216548 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document +0.0001795136627511612 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document +0.0003414021775300033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document +0.00017177291787788502 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document +0.0003441900648571877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document +0.0003394534597060673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document +0.0003236887233114832 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document +0.0001639544129688747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document +0.00019137443753211255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document +0.00018575146284680153 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document +0.00019184792863440243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document +0.00018966043065679055 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document +0.00017968851317035848 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document +0.00018479881897661546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document +0.0001813642692683015 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document +0.0001686449798983066 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document +0.00018516104592230446 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document +0.00031283726601066385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document +0.0003248607542883853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document +0.00031583241601202365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document +0.00031238270857730376 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document +0.000307150592403979 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document +0.00029443829986847044 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document +0.0002942723732234677 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document +0.00023514930666443422 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document +0.0020776328951453444 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document +0.0021768234410538883 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document +0.002106973549276289 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document +0.002110915756171751 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document +0.0017032382109816464 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document +0.0019047944877712286 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document +0.0019402711744016077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document +0.0006264790011223686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document +0.0017885401938106643 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document +0.0003547982093445404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document +0.00035934014428504944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document +0.00035707704501371544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document +0.00035287930712815354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document +0.00035977166728996823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document +0.0003581675664109838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document +0.0003548617059697185 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document +0.0003639582000286208 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document +0.00035375839698688127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document +0.0003743722020080678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document +0.0003530399715341242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document +0.00035511875882752406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document +0.0003618733574783154 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document +0.00035185243285420104 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document +0.0003541503739732106 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document +0.0003631679485751914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document +0.00035748045578182274 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document +0.0003606490690555877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document +0.0003626383296610091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document +0.00035442644361264756 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document +0.00035978370170539796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document +0.0003585562375341541 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document +0.0003601958372888019 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document +0.000350277765402227 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document +0.0003616521184211704 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document +0.0003620625543608188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document +0.0003560781983850704 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document +0.0003553209610592676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document +0.00035905348643915075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document +0.00034744258805696526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document +0.00035462784035661496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document +0.00034768186175100895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document +0.0003568534635532736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document +0.00035586511544371234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document +0.0003524567827568137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document +0.0003512453770426313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document +0.0003591792726468799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document +0.0003514024529343127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document +0.0003584880112586934 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document +0.00035133552916418045 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document +0.0003600811981350215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document +0.0003571663974228119 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document +0.00035768103378874214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document +0.00035939205561113694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document +0.00035186773916029825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document +0.0003542829672490847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document +0.0003592783642898726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document +0.0003556367340099302 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document +0.00035391392271377027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document +0.00035486725707484836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document +0.00034866743396828035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document +0.0003517219808644735 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document +0.00034874458549673823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document +0.000355773136961014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document +0.00035611750387841917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document +0.00035305602013916315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document +0.0003578207127071924 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document +0.00035514635841943707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document +0.00034816946212866206 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document +0.0003512707269761496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document +0.0003483392117980654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document +0.0003572169607204321 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document +0.00035139153281660794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document +0.00035536422129036537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document +0.000352017164107143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document +0.000351889550179365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document +0.000358759689953589 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document +0.0003569286079869268 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document +0.0003657752958602099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document +0.00035396127934790697 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document +0.0003618565071224743 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document +0.00035146051531973204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document +0.00036107135765783567 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document +0.00035019554279994576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document +0.00035567858879904983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document +0.0003504753174793183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document +0.00035931140831329194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document +0.0003502967866002823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document +0.0003532911801041972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document +0.0003583543013070199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document +0.0003566243489931224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document +0.0003468752314799221 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document +0.0003597840618138091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document +0.00035128822484768084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document +0.00035889496943437507 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document +0.000352400524650424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document +0.0003518689536768735 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document +0.00035866864741303467 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document +0.0003454687659106334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document +0.00035348007259317576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document +0.0003539752270940644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document +0.00035146495994081 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document +0.00035397212846310423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document +0.00035208246467162587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document +0.0003490843168676626 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document +0.00035299633658644394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document +0.00034868327466167065 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document +0.00035941351365601583 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document +0.0003545343062735255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document +0.0003528956380445978 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document +0.0003553355770443352 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document +0.0003644224004937743 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document +0.00035234291036216907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document +0.0003596237469847771 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document +0.0003531996065735989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document +0.0003547177054106099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document +0.0003575586499260483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document +0.00035262635135283667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document +0.0003624191962188944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document +0.0003488398052948616 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document +0.0003598294093147917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document +0.00035583006534466323 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document +0.00035403139653225103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document +0.00036134702642187156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document +0.0003573689927162834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document +0.0003577141131435527 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document +0.00035208814419277406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document +0.00035996720683665625 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document +0.00035415304658912596 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document +0.00036353353029443546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document +0.0003537326003150983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document +0.00036053976358299083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document +0.000352380489373494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document +0.00036154661616900994 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document +0.00035959332325963614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document +0.0003597954667189692 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document +0.0003563108270597542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document +0.0003582891940460143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document +0.0003497728210484297 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document +0.0003549834902179354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document +0.0003529828233484542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document +0.00034627483903285777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document +0.00035569006572589215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document +0.00035449377946910314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document +0.00035802844396194623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document +0.0003617277809353208 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document +0.00035034118898654814 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document +0.000351091193908611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document +0.0003527914342210668 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document +0.00035028288369781376 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document +0.00035775745592780506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document +0.0003449630690661468 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document +0.0003583490698830361 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document +0.0003476995746684122 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document +0.0003535632505019212 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document +0.00035640180641147417 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document +0.000361731045691765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document +0.0003534082129597368 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document +0.0003550344149828664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document +0.00035363002411364057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document +0.0003537265579677396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document +0.00034950531383577937 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document +0.00035008511827347514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document +0.00035594533400871325 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document +0.00035266312861335946 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document +0.00035280268794863923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document +0.0003565470391528536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document +0.0003588492322689137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document +0.00035469909697832775 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document +0.00034712082813410526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document +0.000348701157101807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document +0.0003500192014479944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document +0.00035120560544669755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document +0.00035403656850437445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document +0.00035852376560749366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document +0.0003534754068111774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document +0.00035591740046720765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document +0.000348522354782563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document +0.0003533533959664415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document +0.00035631425964030697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document +0.0003485886551574741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document +0.00035917652631065777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document +0.0003482975272111288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document +0.00035580661277480167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document +0.0003492290722955348 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document +0.00034989284450240613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document +0.0003545677216162781 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document +0.00034622286859463484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document +0.00036070626989861965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document +0.00035518365036320786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document +0.00035272907057848406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document +0.0003547343638218734 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document +0.0003496450144966242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document +0.0003537407829294287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document +0.0003489722653985685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document +0.00035057186899911295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document +0.0003507566548933051 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document +0.00035630360179023747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document +0.00035631362503416367 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document +0.0003490204248026821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document +0.00035761724058371226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document +0.00035037664777467137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document +0.000353402110481068 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document +0.00034524163568371745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document +0.00035528523728570974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document +0.00034784916132431703 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document +0.00034928476408048925 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document +0.00034989205973784984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document +0.00034201664404094254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document +0.0003529676016338611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document +0.00034643433682346637 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document +0.0003511666373001904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document +0.00034828669066575333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document +0.0003494625207264413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document +0.0003458957535879216 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document +0.0003543020478990003 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document +0.00034754384069014956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document +0.0003598856392240133 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document +0.0003503335458553846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document +0.00035919595619778716 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document +0.00035767737970754404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document +0.00035197152783998165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document +0.0003549609834422404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document +0.0003568184100569753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document +0.0003512652818651935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document +0.00035912648958665754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document +0.00034764526964056546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document +0.000352439784960359 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document +0.00035295886560764226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document +0.0003518132693658672 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document +0.00035589987915465713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document +0.00034923863317385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document +0.0003457987267929692 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document +0.0003560928663480501 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document +0.0003529603811204932 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document +0.0003524438555443043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document +0.0003438847030263783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document +0.00035981978898461613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document +0.0003446342778566972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document +0.00035529584995236537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document +0.00034855740895831116 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document +0.00034932634912802544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document +0.00035805518303064666 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document +0.0003497941877073061 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document +0.00035774398685405447 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document +0.0003560421780316607 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document +0.0003508844468369392 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document +0.00035731928892270107 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document +0.0003557884626314314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document +0.00034992996760289355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document +0.000360752554360921 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document +0.0003452321668708545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document +0.0003591745226131023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document +0.00035256981433229084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document +0.00035378123159712034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document +0.000350464354895999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document +0.00035074625557389677 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document +0.00035025894701994667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document +0.00035437902514857614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document +0.0003514684519732232 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document +0.00035449717909633905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document +0.0003436816402714221 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document +0.00035139158071782116 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document +0.0003509424079843335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document +0.000343894618577506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document +0.0003500789770661659 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document +0.0003407788080680086 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document +0.0003581908175239701 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document +0.0003465541618780918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document +0.00034600228792437736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document +0.00034416738982773204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document +0.0003519900340150641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document +0.000343369616864659 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document +0.0003544993883274688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document +0.0003504441365073392 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document +0.00034859160702727056 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document +0.00035355909532647185 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document +0.0003471900922691849 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document +0.0003563015508709187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document +0.0003487888744148821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document +0.00034711767548688336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document +0.0003530734609369085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document +0.00035123969242560935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document +0.0003517127620891489 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document +0.00035232835416868673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document +0.0003524437481912308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document +0.0003525996167005602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document +0.00035064770545242043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document +0.00035311558274981226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document +0.00034952204800569914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document +0.0003541471367344846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document +0.00035418812454561825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document +0.0003528951372900714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document +0.0003542338042975688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document +0.00034937738939942796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document +0.0003522182190878447 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document +0.0003501406466507449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document +0.00034973079877492633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document +0.0003485274567713538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document +0.00034999308679368985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document +0.0003570051724707296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document +0.00034567230462019706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document +0.00035529000940160696 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document +0.00034956512308671755 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document +0.0003496962834028953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document +0.0003468745282493457 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document +0.0003502717155809202 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document +0.0003556240880896514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document +0.0003515109488424343 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document +0.0003563156688192592 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document +0.00035040277363989817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document +0.0003481408593290717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document +0.0003624575124332874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document +0.0003522684124250313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document +0.00035286996027653544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document +0.00034967623997256725 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document +0.00035182649587602765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document +0.0003524892557026489 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document +0.0003507642477451811 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document +0.00036190408389835666 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document +0.00035102739424880766 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document +0.00035239718753257265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document +0.00035298076121821316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document +0.0003478704389752654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document +0.0003503109191567942 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document +0.00035143250975654426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document +0.0003480663923069012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document +0.00035691540219998623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document +0.000348815437166351 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document +0.00035202073257766225 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document +0.0003491569096274706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document +0.00035277390475511834 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document +0.0003524972090026609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document +0.0003504854249750236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document +0.00034740238025423914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document +0.00034968015462277606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document +0.0003493798632762674 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document +0.0003488202537862122 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document +0.0003525461864643725 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document +0.00034903815232825664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document +0.00035536982539258216 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document +0.00034858083265155483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document +0.0003505014973608067 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document +0.00035327984042622104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document +0.0003503286677453136 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document +0.00035835274842442816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document +0.00034970302660275595 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document +0.000357929573140149 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document +0.0003517238649788585 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document +0.00036097027318848475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document +0.0003502734074110026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document +0.00035801510806036273 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document +0.0003568006373479869 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document +0.00036128108717454636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document +0.0003563436883111686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document +0.00035559725321852463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document +0.00035089656006854944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document +0.000359453964362057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document +0.00035629498059104033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document +0.0003622207707090437 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document +0.0003540946784512821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document +0.0003594750565232011 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document +0.0003566007415086991 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document +0.0003562142599126134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document +0.0003569948186744601 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document +0.00035166554847920186 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document +0.00035047994419295137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document +0.0003561578193739437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document +0.00035470866838811544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document +0.00034216920464876335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document +0.0003550021513075795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document +0.0003488045105938729 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document +0.0003513340720840151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document +0.0003448558566387584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document +0.0003460966026953241 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document +0.0003488157616036459 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document +0.0003446120387842362 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document +0.000351528602987427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document +0.00035661118227454713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document +0.0003551342699877457 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document +0.0003478953397924445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document +0.00034625782458988215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document +0.0003527515447405871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document +0.00034823744889805696 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document +0.00034823314560254406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document +0.00035162668292961944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document +0.0003477307716074623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document +0.0003446457989477787 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document +0.00034782916273767795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document +0.0003517249130302248 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document +0.0003449873430908556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document +0.00034841291749669877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document +0.0003466028498941749 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document +0.0003486436831199424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document +0.0003478279234211838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document +0.0003495903653274374 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document +0.00034896893881218957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document +0.000348941645312426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document +0.0003474221308416894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document +0.0003462621543839385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document +0.0003669373860863891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document +0.00034691156268163006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document +0.0003527774103765281 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document +0.00034684565672734663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document +0.0003454250599604457 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document +0.0003541536557159006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document +0.000345735737037366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document +0.0003524669816385214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document +0.0003441817133096468 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document +0.0003519093265859089 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document +0.00035080085480352095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document +0.00035285227929327434 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document +0.00034354836346901676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document +0.00034789770937373467 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document +0.000343665920520102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document +0.0003490884931060568 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document +0.00034380029463398654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document +0.00034874768005099945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document +0.0003457058510967673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document +0.00034644265227023904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document +0.00035008339858594957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document +0.0003462377193296194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document +0.0003620491787114201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document +0.000348717011044469 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document +0.00034370072363913706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document +0.0003551981066775649 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document +0.0003500119496799342 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document +0.0003485082952669081 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document +0.0003508155580978919 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document +0.00035311375163251416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document +0.00034945972003423253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document +0.0003474220353789879 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document +0.0003536443686585001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document +0.0003560350489042953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document +0.0003493655927914396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document +0.0003528423977146383 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document +0.00035255554724471217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document +0.0003479760010190111 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document +0.00035458598862501956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document +0.0003458990560538315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document +0.00035157946422379875 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document +0.00034736860650169996 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document +0.0003529152313394119 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document +0.00034586294329524465 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document +0.00035707214923794877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document +0.0003509580363496512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document +0.00035244176725524474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document +0.0003467539557999047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document +0.00034919687962275546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document +0.00035094031731719953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document +0.0003484309008351352 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document +0.0003485409424916253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document +0.0003499590776117838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document +0.0003492842758957848 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document +0.0003529712275178912 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document +0.0003566141287087449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document +0.0003649496522047409 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document +0.0003563218912208234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document +0.00035614782126966145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document +0.0003531944298453266 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document +0.0003535950949566616 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document +0.0003544295554928795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document +0.0003519908503740376 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document +0.00035752817626134463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document +0.0003515322689589972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document +0.0003486893890307115 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document +0.0003446520464889867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document +0.0003509421562481707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document +0.00035335015702909084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document +0.0003490178167345008 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document +0.0003520497821155174 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document +0.0003549762618908944 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document +0.00035072190850833103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document +0.0003542458638526423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document +0.000352419194572916 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document +0.0003545102564672614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document +0.0003495437992331806 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document +0.0003542843376993964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document +0.000352827529313958 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document +0.00035442506093223886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document +0.0003496970719044257 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document +0.0003553096424442362 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document +0.00034986845565067564 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document +0.000352131055186658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document +0.0003527021708198983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document +0.00034905885414547214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document +0.0003583433842468394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document +0.00034409435202828383 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document +0.00034846410520871483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document +0.0003554459991927314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document +0.00035310507471843076 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document +0.000350028910786098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document +0.00035049727458009896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document +0.0003519047735925826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document +0.0003513027429919726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document +0.0003626947260354396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document +0.0003500087324849783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document +0.0003618315726725285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document +0.0003535385113938023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document +0.0003487064058517615 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document +0.0003618709124780938 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document +0.00035040070335625915 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document +0.0003506279032267829 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document +0.0003498435310527524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document +0.0003554634749821431 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document +0.00035091209738758963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document +0.00035034103678978573 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document +0.00035398931854386146 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document +0.00035495529304989485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document +0.00036067883473356603 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document +6.322825248625475e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document +2.4432314037946264e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document +5.6313888721313454e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document +2.4208171781595055e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document +2.325811856369237e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document +2.4010790356322705e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document +5.36773610843632e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document +1.360574433501002e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document +1.3076540344853244e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document +1.3386534334886313e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document +1.2498103719605153e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document +1.403763836949682e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document +1.3636756723495417e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document +1.2242489446940814e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document +1.2398255818973339e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document +1.2972616994216281e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document +1.3947809855914134e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document +1.3144843787829514e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document +1.1693809976572487e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document +1.3677252682893802e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document +1.3940876719849597e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document +1.4222245138730965e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document +1.3201677767919704e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document +1.1421717796486169e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document +1.2890514724498703e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document +1.3649507648749037e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document +1.2400732563490717e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document +1.1557681453277616e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document +1.2294483595964517e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document +1.2137484472122283e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document +1.3299663426456e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document +1.2461984216479532e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document +1.4666434217609636e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document +1.1876997894686238e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document +1.2939155338964078e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document +1.3859590039728515e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document +1.317917848615668e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document +1.1335281536110342e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document +1.2889923952861426e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document +1.3471671647053326e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document +1.2221720014475102e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document +1.2632647276287541e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document +1.28276219004076e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document +1.36213704321643e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document +1.2414858625261553e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document 
+1.3173700421883744e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document +1.295597796725686e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document +1.242783936442904e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document +1.2417374088427464e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document +1.2134479405400744e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document +1.3090040663304255e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document +1.2713470581614905e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document +5.5750231378906594e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document +5.777597358425469e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document +5.349786767471258e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document +5.675165050453583e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document +5.482611216158831e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document +5.065421899890121e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document +5.384718357480146e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document +4.872037363236061e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document +4.532709250783155e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document +5.7257963030489613e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document +4.9014365579652036e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document +5.722863552770969e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document +6.149911636146833e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document +5.2178057608273506e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document +4.990228161160431e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document +5.866186875255134e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document +5.004185734360719e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document +4.79401853705107e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document +5.435219965052376e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document +5.035997225792266e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document +5.622401774211625e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document +5.028826157387559e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document +5.596379470128795e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document +6.027824493191489e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document +5.5358270009931474e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document +5.9839051807685496e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document +5.1221077499249595e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document +5.517228560620279e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document +5.1687858285052305e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document +5.684188244145645e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document +5.212693275535878e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document +4.8551007022784084e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document +5.4888506639203145e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document +5.345098688527242e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document +4.8506420625516594e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document +5.132168603397676e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document +5.719476795114223e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document +5.7448621149792696e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document +4.9068410568059265e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document +5.382937299647678e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document +4.8288432136304634e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document +5.841703200305416e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document +5.1589611587885584e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document +6.031113829732574e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document +5.4558202844532094e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document +5.341852317196142e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document +5.1402942738369954e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document +5.735421384377395e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document 
+5.473629863586958e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document +5.4708993245733936e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document +4.931161863634078e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document +5.104173022127248e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document +5.510157161510824e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document +5.652501401782597e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document +5.7273656573031666e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document +5.638363224821738e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document +5.6128115396668704e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document +5.00304877998141e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document +5.596120554779096e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document +5.5280923889040006e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document +5.223477917938408e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document +5.29472809986569e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document +2.205682378243213e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document +1.4367563720603185e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document +3.5506193487931076e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document +3.0442910855821778e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document +2.2540042508019627e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document +2.6880163202623216e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document +2.534473148048727e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document +2.6560945431318916e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document +2.547470248967691e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document +2.5248825388073738e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document +2.5828729575000054e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document +2.4026583817957736e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document +2.3930425429834413e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document +2.5037365362599724e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document +2.6696745470595603e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document +2.140323051341762e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document +2.617354786691592e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document +1.538359101762691e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document +1.2871029252377856e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document +2.255195411289217e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document +2.4832313897952067e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document +9.303873918189968e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document +2.179532302620228e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document +1.9750517506901206e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document +2.7740420380648435e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document +2.7813714782319335e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document +4.1595357937609806e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document +2.741365122389175e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document +2.117451071361901e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document +1.7132649760565998e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document +1.7492547092602047e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document +1.7499951097392276e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document +1.6632444789170958e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document +1.6678802252361607e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document +1.5519208704558896e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document +1.652420992967167e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document +1.6119931034508755e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document +1.6638882076736552e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document +1.7198076782652946e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document +1.572927860565175e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document +1.5194822618169918e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document 
+1.6677776832669846e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document +1.595612492245688e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document +1.682350633181197e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document +1.663983380609724e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document +1.710187842689243e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document +1.5733697527539038e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document +1.6972104757911438e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document +1.6610142847616577e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document +1.61094882403031e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document +1.4789207305138325e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document +1.639299617676302e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document +1.3241204512116132e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document +8.582260726625535e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document +8.213000975576739e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document +9.549247732811947e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document +9.17242785339013e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document +7.632868223725218e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document +8.674401118222175e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document +9.124384255505347e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document +8.344222222417358e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document +8.992299957499065e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document +8.76689497361025e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document +7.973396239586015e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document +9.006935606644125e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document +8.725545954955498e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document +1.215449694669174e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document +3.3041720284158646e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document +2.0593512412624502e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document +1.893608946986248e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document +1.737111666788535e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document +1.4915923449873955e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document +2.289370239067605e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document +2.8615335689614638e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document +8.847283630883125e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document +1.8175470362373804e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document +1.8152226683368038e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document +1.789149655314284e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document +1.7690523036477663e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document +1.8333732213753644e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document +1.8794105687718654e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document +1.721841156706417e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document +2.0612008685724796e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document +1.9297370681336376e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document +2.0188440409661018e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document +5.1741216329695265e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document +1.3417913926038429e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document +1.1010813016469651e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document +1.1252416134320087e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document +1.2801744104313002e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document +1.3041514955795817e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document +1.3428837580879075e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document +1.320809382267804e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document +1.3451566676555968e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document +1.228284926657501e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document +1.2410599573923043e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document 
+1.3815343367377182e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document +1.3895126265148832e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document +1.2306773644401741e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document +1.32981021906281e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document +1.101337469221607e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document +1.513094184404692e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document +1.1073759547073234e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document +1.2879348765857567e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document +9.619595770228435e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document +1.2384340836286436e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document +1.1766667232211577e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document +1.2871049236196452e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document +1.2010645926497744e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document +1.3971428231518597e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document +1.2283733550547932e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document +1.2659530508255308e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document +1.551775613074462e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document +1.1169413343776979e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document +1.1433700593712463e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document +4.964773647323492e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document +1.0995586595687313e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document +1.2957393071411267e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document +2.75899247407709e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document +2.8269344597344854e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document +2.329108187246831e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document +2.4231761430460284e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document +1.2434140512230442e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document +1.638718338352859e-05 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document +3.272953556801187e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document +6.061314500486327e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document +1.2465979731210292e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document +1.2737557327967737e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document +1.038428658075627e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document +2.61666472045566e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document +3.6506873212272224e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document +1.5066359138295701e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document +1.1166290872121178e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document +1.5546966228590285e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document +1.2583434625014828e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document +1.3398826881300862e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document +1.2944933160515968e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document +1.0971437399901365e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document +1.2787922795775774e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document +1.404979227816985e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document +1.3344734431324463e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document +4.886031157107555e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document +3.277261443596394e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document +3.5057957685786495e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document +3.287625301718589e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document +3.1370056372668855e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document +3.186092015785841e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document +7.271819324142512e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document +0.001451215788905126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document +0.0014486847196258788 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document +0.0008861032722895899 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document +0.0018119590809459816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document +0.0008916937917547129 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document +6.960128832809415e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document +0.002008403651063623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document +0.0014374900742131454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document +0.00180213596996716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document +0.001956178877532413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document +0.0008829547017667033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document +0.0008910853619157279 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document +0.0018260998845299973 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document +0.0012499632072059553 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document +0.00125398260359913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document +0.0012541704774729071 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document +0.0012527268234360602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document +0.0012532925243737164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document +0.0012456396241204315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document +0.0012589894424352072 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document +0.001508020123999618 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document +0.00333096950781965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document +0.0033233414614415547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document +0.003512387990689828 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document +0.0035091382940513126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document +0.003514155927147005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document +0.003327108000579638 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document +0.003329106196589836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document +0.003505604148738077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document +0.003324825759567855 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document +0.0033248240149804913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document +0.0033385962112851358 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document +0.0035043186296553615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document +0.003340469505431529 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document +0.0035106889084796276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document +0.0033309469281030167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document +0.003340337858029757 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document +0.003505919861097801 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document +0.0003882924098240512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document +0.0005759963691850877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document +0.0005959971675332674 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document +0.0006026179290353799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document +0.0005824184320784846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document +0.0005854598548616037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document +0.0005903767055633473 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document +0.0005930306490982049 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document +0.000569425602700746 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document +0.0005675060415179408 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document +0.0005772431621253389 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document +0.0005678026053826858 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document +0.0005700398263483378 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document +0.0005669467963528824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document +0.0005701015953324305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document +0.0005795907287413296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document +0.0005735602737531164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document +0.0005749862745842101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document +0.0005693257015931971 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document +0.0005716568794795563 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document +0.0005761083919774021 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document +0.0005688343169797355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document +0.0005807913190929842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document +0.0005710229258078636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document +0.0005704083039826862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document +0.0005862132348308056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document +0.0005717662049559556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document +0.0005858155213694451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document +0.0005812012281792392 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document +0.0005803981414588498 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document +0.0005700102108287723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document +0.0005719243459052329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document +0.0005867253401661752 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document +0.0005731087218860733 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document +0.0005712197789109317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document +0.0005702376926310089 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document +0.0005700411527742972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document +0.0005828090098178196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document +0.0005770140826168056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document +0.0005723509664597896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document +0.0005755499231836962 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document +0.0005636407438471367 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document +0.0005640281556500104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document +0.0005633159058766496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document +0.0005638034311151449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document +0.0005630066273073224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document +0.0005631803831128559 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document +0.0005631228881679657 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document +0.0005628178701487633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document +0.0005624448092256196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document +0.0005620957024062329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document +0.0005614201504177484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document +0.0005616890951464056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document +0.0005611348559279058 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document +0.0005604238061828518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document +0.0005603301490194237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document +0.0005607291294548833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document +0.0005605234569930727 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document +0.0005613778566640694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document +0.0005610248539992471 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document +0.0005599977416780475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document +0.0005603632562116935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document +0.0005599177479509897 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document +0.0005595202318298379 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document +0.0005600975633499175 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document +0.0005614075491213365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document +0.000612563885043477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document +0.0005515469909644413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document +0.0005526782014946906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document +0.0005472463408095445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document +0.0005502284746004587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document +0.0005414514790555363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document +0.0005513499500134784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document +0.0005391391454105187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document +0.0005415836910001838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document +0.0005208132468536551 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document +0.0005889827143132871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document +0.0005822520817765276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document +0.0004173155230758696 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document +0.0009994361338078242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document +0.001087156194657966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document +0.0010667737163656816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document +0.0009602877882124873 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document +0.0008968956271971105 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document +0.0009198034843762967 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document +0.0009423901016715341 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document +0.0009674094553686345 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document +0.0009858331322519164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document +0.0009970593645879198 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document +0.0010027035193731686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document +0.0010128291154221853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document +0.0010215631382631918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document +0.0010288663771461238 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document +0.0010346219929285867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document +0.00104544019940344 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document +0.0010525172676724333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document +0.0010609529620775127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document +0.0010725892748610153 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document +0.0010818563598181568 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document +0.0010992760196793917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document +0.0011178992762079917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document +0.001124687532085676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document +0.001118303661267191 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document +0.0010206825575416534 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document +0.0005512280117499715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document +0.004474659408857016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document +0.00409944473890653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document +0.005137179939941845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document +0.005143172251066109 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document +0.005206134363352808 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document +0.004892747858974329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document +0.004844731352552902 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document +0.005308320169123755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document +0.005124709815666577 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document +0.005424710744483826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document +0.00538244648861977 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document +0.0029107284679086853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document +0.0026825258998444705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document +0.0026904503191419243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document +0.002687906577174073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document +0.002850165346048818 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document +0.005322698571717847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document +0.004450334290869719 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document +0.004700990083440683 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document +0.003903568556500995 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document +0.00390561515396931 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document +0.0039046402900912262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document +0.003907454839379547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document +0.0038583224578603824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document +0.0037914116657695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document 
+0.003786665266798682 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document +0.003792000802430658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document +0.00319266847466091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document +0.0032658716699838944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document +0.0034801959532460023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document +0.0028307012092022594 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document +0.0028420360878146276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document +0.0028410455248484914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document +0.00283497183526842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document +0.002840187195459487 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document +0.0028398709431369834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document +0.004364722843422023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document +0.004093255713117101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document +0.004092331079566252 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document +0.004005326985579649 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document +0.0036205502856964207 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document +0.003625316793034984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document +0.003604743435602363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document +0.0035405823343673125 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document +0.0041601413517253945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document +0.005886303658937057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document +0.003600909532810332 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document +0.0034941365817168658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document +0.0004992164842980224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document +0.00032927705604725614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document +0.0002860154190878753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document +0.0002845217585425619 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document +0.0002743528685497456 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document +0.00026025323737738766 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document +0.00023493876414603155 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document +0.00029665994994226705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document +0.00031808102075993956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document +0.00031813573046011285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document +0.0002711905171855542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document +0.00028892513401817095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document +0.00030003908676979083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document +0.00026839878771944684 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document +0.00029155935002690497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document +0.0002998624927624209 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document +0.0003091705447974841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document +0.00026873195794309786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document +0.00027721873498527547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document +0.0002841662554024377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document +0.0002839461156551537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document +0.0002861705604659811 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document +0.0002460995649635886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document +0.00019420142619795496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document +0.00021967677816173628 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document +0.0002620283200480949 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document +0.0002433390542188936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document +0.00021254976608350767 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document +0.00022094815569522115 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document +0.000342862378668244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document +0.00033784225259118157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document +0.0003367278459543952 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document +0.00029843279042852765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document +0.0002926583661257988 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document +0.00029320337282010673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document +0.00029281450669483455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document +0.0002915338187002653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document +0.0002864226923084572 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document +0.00028643439083586396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document +0.00028253710956299054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document +0.0002810856078805806 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document +0.00031474941344656715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document +0.0002139130222205655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document +0.0003084648871862831 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document +0.0003309477872140129 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document +0.0003360096824695161 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document +0.0003355452655196557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document +0.00038119390366386037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document +0.00038078927630086064 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document +0.0003386200917551554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document +0.0002158905159938882 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document +0.00021621682877018768 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document +0.00021553306942740535 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document +0.00021581563462722296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document +0.0002157694110556169 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document +0.000215643699847159 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document +0.00021532716715168094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document +0.00021531221326022472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document +0.0002831801179028896 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document
+0.0002514844936507595 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document
+0.00031638782778107964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document
+0.0002749197545278445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document
+0.00026159721512464495 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document
+0.0002630052420096968 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document
+0.00031106811228913666 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document
+0.0002852973415334161 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document
+3.7555372465932136e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document
+0.003548077173506675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document
+0.0018372203137874265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document
diff --git a/ALCF/data-lists/aurora/falcon.txt b/ALCF/data-lists/aurora/falcon.txt
new file mode 100644
index 0000000000..997038fb38
--- /dev/null
+++ b/ALCF/data-lists/aurora/falcon.txt
@@ -0,0 +1,501 @@
+0.0003547982093445404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document falcon
+0.00035934014428504944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document falcon
+0.00035707704501371544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document falcon
+0.00035287930712815354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document falcon
+0.00035977166728996823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document falcon
+0.0003581675664109838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document falcon
+0.0003548617059697185 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document falcon
+0.0003639582000286208 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document falcon
+0.00035375839698688127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document falcon
+0.0003743722020080678 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document falcon
+0.0003530399715341242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document falcon
+0.00035511875882752406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document falcon
+0.0003618733574783154 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document falcon
+0.00035185243285420104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document falcon
+0.0003541503739732106
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document falcon +0.0003631679485751914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document falcon +0.00035748045578182274 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document falcon +0.0003606490690555877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document falcon +0.0003626383296610091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document falcon +0.00035442644361264756 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document falcon +0.00035978370170539796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document falcon +0.0003585562375341541 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document falcon +0.0003601958372888019 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document falcon +0.000350277765402227 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document falcon +0.0003616521184211704 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document falcon +0.0003620625543608188 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document falcon +0.0003560781983850704 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document falcon +0.0003553209610592676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document falcon +0.00035905348643915075 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document falcon +0.00034744258805696526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document falcon +0.00035462784035661496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document falcon +0.00034768186175100895 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document falcon +0.0003568534635532736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document falcon +0.00035586511544371234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document falcon +0.0003524567827568137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document falcon +0.0003512453770426313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document falcon +0.0003591792726468799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document falcon +0.0003514024529343127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document falcon +0.0003584880112586934 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document falcon +0.00035133552916418045 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document falcon +0.0003600811981350215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document falcon 
+0.0003571663974228119 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document falcon +0.00035768103378874214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document falcon +0.00035939205561113694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document falcon +0.00035186773916029825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document falcon +0.0003542829672490847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document falcon +0.0003592783642898726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document falcon +0.0003556367340099302 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document falcon +0.00035391392271377027 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document falcon +0.00035486725707484836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document falcon +0.00034866743396828035 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document falcon +0.0003517219808644735 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document falcon +0.00034874458549673823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document falcon +0.000355773136961014 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document falcon +0.00035611750387841917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document falcon +0.00035305602013916315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document falcon +0.0003578207127071924 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document falcon +0.00035514635841943707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document falcon +0.00034816946212866206 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document falcon +0.0003512707269761496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document falcon +0.0003483392117980654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document falcon +0.0003572169607204321 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document falcon +0.00035139153281660794 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document falcon +0.00035536422129036537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document falcon +0.000352017164107143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document falcon +0.000351889550179365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document falcon +0.000358759689953589 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document falcon +0.0003569286079869268 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document falcon +0.0003657752958602099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document falcon +0.00035396127934790697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document falcon +0.0003618565071224743 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document falcon +0.00035146051531973204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document falcon +0.00036107135765783567 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document falcon +0.00035019554279994576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document falcon +0.00035567858879904983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document falcon +0.0003504753174793183 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document falcon +0.00035931140831329194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document falcon +0.0003502967866002823 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document falcon +0.0003532911801041972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document falcon +0.0003583543013070199 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document falcon +0.0003566243489931224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document falcon +0.0003468752314799221 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document falcon +0.0003597840618138091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document falcon +0.00035128822484768084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document falcon +0.00035889496943437507 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document falcon +0.000352400524650424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document falcon +0.0003518689536768735 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document falcon +0.00035866864741303467 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document falcon +0.0003454687659106334 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document falcon +0.00035348007259317576 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document falcon +0.0003539752270940644 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document falcon +0.00035146495994081 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document falcon +0.00035397212846310423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document falcon +0.00035208246467162587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document falcon 
+0.0003490843168676626 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document falcon +0.00035299633658644394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document falcon +0.00034868327466167065 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document falcon +0.00035941351365601583 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document falcon +0.0003545343062735255 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document falcon +0.0003528956380445978 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document falcon +0.0003553355770443352 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document falcon +0.0003644224004937743 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document falcon +0.00035234291036216907 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document falcon +0.0003596237469847771 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document falcon +0.0003531996065735989 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document falcon +0.0003547177054106099 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document falcon +0.0003575586499260483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document falcon +0.00035262635135283667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document falcon +0.0003624191962188944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document falcon +0.0003488398052948616 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document falcon +0.0003598294093147917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document falcon +0.00035583006534466323 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document falcon +0.00035403139653225103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document falcon +0.00036134702642187156 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document falcon +0.0003573689927162834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document falcon +0.0003577141131435527 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document falcon +0.00035208814419277406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document falcon +0.00035996720683665625 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document falcon +0.00035415304658912596 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document falcon +0.00036353353029443546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document falcon +0.0003537326003150983 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document falcon +0.00036053976358299083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document falcon +0.000352380489373494 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document falcon +0.00036154661616900994 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document falcon +0.00035959332325963614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document falcon +0.0003597954667189692 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document falcon +0.0003563108270597542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document falcon +0.0003582891940460143 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document falcon +0.0003497728210484297 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document falcon +0.0003549834902179354 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document falcon +0.0003529828233484542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document falcon +0.00034627483903285777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document falcon +0.00035569006572589215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document falcon +0.00035449377946910314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document falcon +0.00035802844396194623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document falcon +0.0003617277809353208 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document falcon +0.00035034118898654814 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document falcon +0.000351091193908611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document falcon +0.0003527914342210668 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document falcon +0.00035028288369781376 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document falcon +0.00035775745592780506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document falcon +0.0003449630690661468 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document falcon +0.0003583490698830361 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document falcon +0.0003476995746684122 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document falcon +0.0003535632505019212 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document falcon +0.00035640180641147417 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document falcon +0.000361731045691765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document falcon 
+0.0003534082129597368 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document falcon +0.0003550344149828664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document falcon +0.00035363002411364057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document falcon +0.0003537265579677396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document falcon +0.00034950531383577937 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document falcon +0.00035008511827347514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document falcon +0.00035594533400871325 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document falcon +0.00035266312861335946 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document falcon +0.00035280268794863923 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document falcon +0.0003565470391528536 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document falcon +0.0003588492322689137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document falcon +0.00035469909697832775 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document falcon +0.00034712082813410526 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document falcon +0.000348701157101807 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document falcon +0.0003500192014479944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document falcon +0.00035120560544669755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document falcon +0.00035403656850437445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document falcon +0.00035852376560749366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document falcon +0.0003534754068111774 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document falcon +0.00035591740046720765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document falcon +0.000348522354782563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document falcon +0.0003533533959664415 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document falcon +0.00035631425964030697 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document falcon +0.0003485886551574741 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document falcon +0.00035917652631065777 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document falcon +0.0003482975272111288 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document falcon +0.00035580661277480167 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document falcon +0.0003492290722955348 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document falcon +0.00034989284450240613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document falcon +0.0003545677216162781 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document falcon +0.00034622286859463484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document falcon +0.00036070626989861965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document falcon +0.00035518365036320786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document falcon +0.00035272907057848406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document falcon +0.0003547343638218734 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document falcon +0.0003496450144966242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document falcon +0.0003537407829294287 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document falcon +0.0003489722653985685 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document falcon +0.00035057186899911295 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document falcon +0.0003507566548933051 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document falcon +0.00035630360179023747 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document falcon +0.00035631362503416367 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document falcon +0.0003490204248026821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document falcon +0.00035761724058371226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document falcon +0.00035037664777467137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document falcon +0.000353402110481068 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document falcon +0.00034524163568371745 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document falcon +0.00035528523728570974 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document falcon +0.00034784916132431703 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document falcon +0.00034928476408048925 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document falcon +0.00034989205973784984 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document falcon +0.00034201664404094254 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document falcon +0.0003529676016338611 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document 
falcon +0.00034643433682346637 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document falcon +0.0003511666373001904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document falcon +0.00034828669066575333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document falcon +0.0003494625207264413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document falcon +0.0003458957535879216 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document falcon +0.0003543020478990003 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document falcon +0.00034754384069014956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document falcon +0.0003598856392240133 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document falcon +0.0003503335458553846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document falcon +0.00035919595619778716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document falcon +0.00035767737970754404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document falcon +0.00035197152783998165 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document falcon +0.0003549609834422404 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document falcon +0.0003568184100569753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document falcon +0.0003512652818651935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document falcon +0.00035912648958665754 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document falcon +0.00034764526964056546 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document falcon +0.000352439784960359 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document falcon +0.00035295886560764226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document falcon +0.0003518132693658672 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document falcon +0.00035589987915465713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document falcon +0.00034923863317385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document falcon +0.0003457987267929692 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document falcon +0.0003560928663480501 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document falcon +0.0003529603811204932 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document falcon +0.0003524438555443043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document falcon +0.0003438847030263783 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document falcon +0.00035981978898461613 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document falcon +0.0003446342778566972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document falcon +0.00035529584995236537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document falcon +0.00034855740895831116 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document falcon +0.00034932634912802544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document falcon +0.00035805518303064666 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document falcon +0.0003497941877073061 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document falcon +0.00035774398685405447 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document falcon +0.0003560421780316607 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document falcon +0.0003508844468369392 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document falcon +0.00035731928892270107 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document falcon +0.0003557884626314314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document falcon +0.00034992996760289355 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document falcon +0.000360752554360921 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document falcon +0.0003452321668708545 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document falcon +0.0003591745226131023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document falcon +0.00035256981433229084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document falcon +0.00035378123159712034 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document falcon +0.000350464354895999 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document falcon +0.00035074625557389677 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document falcon +0.00035025894701994667 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document falcon +0.00035437902514857614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document falcon +0.0003514684519732232 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document falcon +0.00035449717909633905 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document falcon +0.0003436816402714221 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document falcon +0.00035139158071782116 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document falcon 
+0.0003509424079843335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document falcon +0.000343894618577506 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document falcon +0.0003500789770661659 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document falcon +0.0003407788080680086 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document falcon +0.0003581908175239701 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document falcon +0.0003465541618780918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document falcon +0.00034600228792437736 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document falcon +0.00034416738982773204 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document falcon +0.0003519900340150641 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document falcon +0.000343369616864659 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document falcon +0.0003544993883274688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document falcon +0.0003504441365073392 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document falcon +0.00034859160702727056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document falcon +0.00035355909532647185 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document falcon +0.0003471900922691849 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document falcon +0.0003563015508709187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document falcon +0.0003487888744148821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document falcon +0.00034711767548688336 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document falcon +0.0003530734609369085 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document falcon +0.00035123969242560935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document falcon +0.0003517127620891489 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document falcon +0.00035232835416868673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document falcon +0.0003524437481912308 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document falcon +0.0003525996167005602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document falcon +0.00035064770545242043 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document falcon +0.00035311558274981226 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document falcon +0.00034952204800569914 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document falcon +0.0003541471367344846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document falcon +0.00035418812454561825 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document falcon +0.0003528951372900714 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document falcon +0.0003542338042975688 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document falcon +0.00034937738939942796 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document falcon +0.0003522182190878447 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document falcon +0.0003501406466507449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document falcon +0.00034973079877492633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document falcon +0.0003485274567713538 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document falcon +0.00034999308679368985 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document falcon +0.0003570051724707296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document falcon +0.00034567230462019706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document falcon +0.00035529000940160696 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document falcon +0.00034956512308671755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document falcon +0.0003496962834028953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document falcon +0.0003468745282493457 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document falcon +0.0003502717155809202 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document falcon +0.0003556240880896514 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document falcon +0.0003515109488424343 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document falcon +0.0003563156688192592 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document falcon +0.00035040277363989817 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document falcon +0.0003481408593290717 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document falcon +0.0003624575124332874 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document falcon +0.0003522684124250313 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document falcon +0.00035286996027653544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document falcon +0.00034967623997256725 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document falcon 
+0.00035182649587602765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document falcon +0.0003524892557026489 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document falcon +0.0003507642477451811 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document falcon +0.00036190408389835666 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document falcon +0.00035102739424880766 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document falcon +0.00035239718753257265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document falcon +0.00035298076121821316 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document falcon +0.0003478704389752654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document falcon +0.0003503109191567942 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document falcon +0.00035143250975654426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document falcon +0.0003480663923069012 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document falcon +0.00035691540219998623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document falcon +0.000348815437166351 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document falcon +0.00035202073257766225 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document falcon +0.0003491569096274706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document falcon +0.00035277390475511834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document falcon +0.0003524972090026609 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document falcon +0.0003504854249750236 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document falcon +0.00034740238025423914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document falcon +0.00034968015462277606 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document falcon +0.0003493798632762674 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document falcon +0.0003488202537862122 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document falcon +0.0003525461864643725 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document falcon +0.00034903815232825664 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document falcon +0.00035536982539258216 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document falcon +0.00034858083265155483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document falcon +0.0003505014973608067 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document falcon +0.00035327984042622104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document falcon +0.0003503286677453136 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document falcon +0.00035835274842442816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document falcon +0.00034970302660275595 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document falcon +0.000357929573140149 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document falcon +0.0003517238649788585 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document falcon +0.00036097027318848475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document falcon +0.0003502734074110026 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document falcon +0.00035801510806036273 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document falcon +0.0003568006373479869 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document falcon +0.00036128108717454636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document falcon +0.0003563436883111686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document falcon +0.00035559725321852463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document falcon +0.00035089656006854944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document falcon +0.000359453964362057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document falcon +0.00035629498059104033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document falcon +0.0003622207707090437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document falcon +0.0003540946784512821 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document falcon +0.0003594750565232011 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document falcon +0.0003566007415086991 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document falcon +0.0003562142599126134 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document falcon +0.0003569948186744601 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document falcon +0.00035166554847920186 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document falcon +0.00035047994419295137 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document falcon +0.0003561578193739437 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document falcon +0.00035470866838811544 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document falcon 
+0.00034216920464876335 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document falcon +0.0003550021513075795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document falcon +0.0003488045105938729 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document falcon +0.0003513340720840151 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document falcon +0.0003448558566387584 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document falcon +0.0003460966026953241 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document falcon +0.0003488157616036459 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document falcon +0.0003446120387842362 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document falcon +0.000351528602987427 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document falcon +0.00035661118227454713 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document falcon +0.0003551342699877457 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document falcon +0.0003478953397924445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document falcon +0.00034625782458988215 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document falcon +0.0003527515447405871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document falcon +0.00034823744889805696 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document falcon +0.00034823314560254406 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document falcon +0.00035162668292961944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document falcon +0.0003477307716074623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document falcon +0.0003446457989477787 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document falcon +0.00034782916273767795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document falcon +0.0003517249130302248 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document falcon +0.0003449873430908556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document falcon +0.00034841291749669877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document falcon +0.0003466028498941749 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document falcon +0.0003486436831199424 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document falcon +0.0003478279234211838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document falcon +0.0003495903653274374 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document falcon +0.00034896893881218957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document falcon +0.000348941645312426 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document falcon +0.0003474221308416894 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document falcon +0.0003462621543839385 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document falcon +0.0003669373860863891 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document falcon +0.00034691156268163006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document falcon +0.0003527774103765281 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document falcon +0.00034684565672734663 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document falcon +0.0003454250599604457 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document falcon +0.0003541536557159006 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document falcon +0.000345735737037366 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document falcon +0.0003524669816385214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document falcon +0.0003441817133096468 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document falcon +0.0003519093265859089 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document falcon +0.00035080085480352095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document falcon +0.00035285227929327434 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document falcon +0.00034354836346901676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document falcon +0.00034789770937373467 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document falcon +0.000343665920520102 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document falcon +0.0003490884931060568 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document falcon +0.00034380029463398654 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document falcon +0.00034874768005099945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document falcon +0.0003457058510967673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document falcon +0.00034644265227023904 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document falcon +0.00035008339858594957 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document falcon +0.0003462377193296194 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document falcon 
+0.0003620491787114201 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document falcon +0.000348717011044469 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document falcon +0.00034370072363913706 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document falcon +0.0003551981066775649 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document falcon +0.0003500119496799342 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document falcon +0.0003485082952669081 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document falcon +0.0003508155580978919 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document falcon +0.00035311375163251416 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document falcon +0.00034945972003423253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document falcon +0.0003474220353789879 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document falcon +0.0003536443686585001 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document falcon +0.0003560350489042953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document falcon +0.0003493655927914396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document falcon +0.0003528423977146383 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document falcon +0.00035255554724471217 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document falcon +0.0003479760010190111 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document falcon +0.00035458598862501956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document falcon +0.0003458990560538315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document falcon +0.00035157946422379875 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document falcon +0.00034736860650169996 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document falcon +0.0003529152313394119 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document falcon +0.00034586294329524465 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document falcon +0.00035707214923794877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document falcon +0.0003509580363496512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document falcon +0.00035244176725524474 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document falcon +0.0003467539557999047 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document falcon +0.00034919687962275546 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document falcon +0.00035094031731719953 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document falcon +0.0003484309008351352 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document falcon +0.0003485409424916253 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document falcon +0.0003499590776117838 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document falcon +0.0003492842758957848 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document falcon +0.0003529712275178912 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document falcon +0.0003566141287087449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document falcon +0.0003649496522047409 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document falcon +0.0003563218912208234 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document falcon +0.00035614782126966145 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document falcon +0.0003531944298453266 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document falcon +0.0003535950949566616 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document falcon +0.0003544295554928795 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document falcon +0.0003519908503740376 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document falcon +0.00035752817626134463 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document falcon +0.0003515322689589972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document falcon +0.0003486893890307115 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document falcon +0.0003446520464889867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document falcon +0.0003509421562481707 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document falcon +0.00035335015702909084 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document falcon +0.0003490178167345008 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document falcon +0.0003520497821155174 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document falcon +0.0003549762618908944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document falcon +0.00035072190850833103 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document falcon +0.0003542458638526423 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document falcon +0.000352419194572916 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document falcon 
+0.0003545102564672614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document falcon +0.0003495437992331806 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document falcon +0.0003542843376993964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document falcon +0.000352827529313958 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document falcon +0.00035442506093223886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document falcon +0.0003496970719044257 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document falcon +0.0003553096424442362 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document falcon +0.00034986845565067564 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document falcon +0.000352131055186658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document falcon +0.0003527021708198983 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document falcon +0.00034905885414547214 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document falcon +0.0003583433842468394 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document falcon +0.00034409435202828383 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document falcon +0.00034846410520871483 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document falcon +0.0003554459991927314 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document falcon +0.00035310507471843076 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document falcon +0.000350028910786098 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document falcon +0.00035049727458009896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document falcon +0.0003519047735925826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document falcon +0.0003513027429919726 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document falcon +0.0003626947260354396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document falcon +0.0003500087324849783 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document falcon +0.0003618315726725285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document falcon +0.0003535385113938023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document falcon +0.0003487064058517615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document falcon +0.0003618709124780938 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document falcon +0.00035040070335625915 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document falcon
+0.0003506279032267829 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document falcon
+0.0003498435310527524 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document falcon
+0.0003554634749821431 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document falcon
+0.00035091209738758963 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document falcon
+0.00035034103678978573 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document falcon
+0.00035398931854386146 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document falcon
+0.00035495529304989485 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document falcon
+0.00036067883473356603 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document falcon
+
diff --git a/ALCF/data-lists/aurora/megawiki.txt b/ALCF/data-lists/aurora/megawiki.txt
new file mode 100644
index 0000000000..635eba3d90
--- /dev/null
+++ b/ALCF/data-lists/aurora/megawiki.txt
@@ -0,0 +1,262 @@
+6.322825248625475e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document megawika
+2.4432314037946264e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document megawika
+5.6313888721313454e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document megawika
+2.4208171781595055e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document megawika
+2.325811856369237e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document megawika
+2.4010790356322705e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document megawika
+5.36773610843632e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document megawika
+1.360574433501002e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document megawika
+1.3076540344853244e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document megawika
+1.3386534334886313e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document megawika
+1.2498103719605153e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document megawika
+1.403763836949682e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document megawika
+1.3636756723495417e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document megawika
+1.2242489446940814e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document megawika
+1.2398255818973339e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document megawika
+1.2972616994216281e-05
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document megawika +1.3947809855914134e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document megawika +1.3144843787829514e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document megawika +1.1693809976572487e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document megawika +1.3677252682893802e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document megawika +1.3940876719849597e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document megawika +1.4222245138730965e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document megawika +1.3201677767919704e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document megawika +1.1421717796486169e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document megawika +1.2890514724498703e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document megawika +1.3649507648749037e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document megawika +1.2400732563490717e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document megawika +1.1557681453277616e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document megawika +1.2294483595964517e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document megawika +1.2137484472122283e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document megawika +1.3299663426456e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document megawika +1.2461984216479532e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document megawika +1.4666434217609636e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document megawika +1.1876997894686238e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document megawika +1.2939155338964078e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document megawika +1.3859590039728515e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document megawika +1.317917848615668e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document megawika +1.1335281536110342e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document megawika +1.2889923952861426e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document megawika +1.3471671647053326e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document megawika +1.2221720014475102e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document megawika 
+1.2632647276287541e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document megawika +1.28276219004076e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document megawika +1.36213704321643e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document megawika +1.2414858625261553e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document megawika +1.3173700421883744e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document megawika +1.295597796725686e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document megawika +1.242783936442904e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document megawika +1.2417374088427464e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document megawika +1.2134479405400744e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document megawika +1.3090040663304255e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document megawika +1.2713470581614905e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document megawika +5.5750231378906594e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document megawika +5.777597358425469e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document megawika +5.349786767471258e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document megawika +5.675165050453583e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document megawika +5.482611216158831e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document megawika +5.065421899890121e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document megawika +5.384718357480146e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document megawika +4.872037363236061e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document megawika +4.532709250783155e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document megawika +5.7257963030489613e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document megawika +4.9014365579652036e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document megawika +5.722863552770969e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document megawika +6.149911636146833e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document megawika +5.2178057608273506e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document megawika +4.990228161160431e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document megawika 
+5.866186875255134e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document megawika +5.004185734360719e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document megawika +4.79401853705107e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document megawika +5.435219965052376e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document megawika +5.035997225792266e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document megawika +5.622401774211625e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document megawika +5.028826157387559e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document megawika +5.596379470128795e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document megawika +6.027824493191489e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document megawika +5.5358270009931474e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document megawika +5.9839051807685496e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document megawika +5.1221077499249595e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document megawika +5.517228560620279e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document megawika +5.1687858285052305e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document megawika +5.684188244145645e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document megawika +5.212693275535878e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document megawika +4.8551007022784084e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document megawika +5.4888506639203145e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document megawika +5.345098688527242e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document megawika +4.8506420625516594e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document megawika +5.132168603397676e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document megawika +5.719476795114223e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document megawika +5.7448621149792696e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document megawika +4.9068410568059265e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document megawika +5.382937299647678e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document megawika +4.8288432136304634e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document megawika 
+5.841703200305416e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document megawika +5.1589611587885584e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document megawika +6.031113829732574e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document megawika +5.4558202844532094e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document megawika +5.341852317196142e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document megawika +5.1402942738369954e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document megawika +5.735421384377395e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document megawika +5.473629863586958e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document megawika +5.4708993245733936e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document megawika +4.931161863634078e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document megawika +5.104173022127248e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document megawika +5.510157161510824e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document megawika +5.652501401782597e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document megawika +5.7273656573031666e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document megawika +5.638363224821738e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document megawika +5.6128115396668704e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document megawika +5.00304877998141e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document megawika +5.596120554779096e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document megawika +5.5280923889040006e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document megawika +5.223477917938408e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document megawika +5.29472809986569e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document megawika +2.205682378243213e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document megawika +1.4367563720603185e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document megawika +3.5506193487931076e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document megawika +3.0442910855821778e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document megawika +2.2540042508019627e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document megawika 
+2.6880163202623216e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document megawika +2.534473148048727e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document megawika +2.6560945431318916e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document megawika +2.547470248967691e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document megawika +2.5248825388073738e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document megawika +2.5828729575000054e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document megawika +2.4026583817957736e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document megawika +2.3930425429834413e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document megawika +2.5037365362599724e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document megawika +2.6696745470595603e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document megawika +2.140323051341762e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document megawika +2.617354786691592e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document megawika +1.538359101762691e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document megawika +1.2871029252377856e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document megawika +2.255195411289217e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document megawika +2.4832313897952067e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document megawika +9.303873918189968e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document megawika +2.179532302620228e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document megawika +1.9750517506901206e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document megawika +2.7740420380648435e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document megawika +2.7813714782319335e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document megawika +4.1595357937609806e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document megawika +2.741365122389175e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document megawika +2.117451071361901e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document megawika +1.7132649760565998e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document megawika +1.7492547092602047e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document megawika 
+1.7499951097392276e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document megawika +1.6632444789170958e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document megawika +1.6678802252361607e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document megawika +1.5519208704558896e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document megawika +1.652420992967167e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document megawika +1.6119931034508755e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document megawika +1.6638882076736552e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document megawika +1.7198076782652946e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document megawika +1.572927860565175e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document megawika +1.5194822618169918e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document megawika +1.6677776832669846e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document megawika +1.595612492245688e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document megawika +1.682350633181197e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document megawika +1.663983380609724e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document megawika +1.710187842689243e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document megawika +1.5733697527539038e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document megawika +1.6972104757911438e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document megawika +1.6610142847616577e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document megawika +1.61094882403031e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document megawika +1.4789207305138325e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document megawika +1.639299617676302e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document megawika +1.3241204512116132e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document megawika +8.582260726625535e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document megawika +8.213000975576739e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document megawika +9.549247732811947e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document megawika +9.17242785339013e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document megawika 
+7.632868223725218e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document megawika +8.674401118222175e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document megawika +9.124384255505347e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document megawika +8.344222222417358e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document megawika +8.992299957499065e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document megawika +8.76689497361025e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document megawika +7.973396239586015e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document megawika +9.006935606644125e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document megawika +8.725545954955498e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document megawika +1.215449694669174e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document megawika +3.3041720284158646e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document megawika +2.0593512412624502e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document megawika +1.893608946986248e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document megawika +1.737111666788535e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document megawika +1.4915923449873955e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document megawika +2.289370239067605e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document megawika +2.8615335689614638e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document megawika +8.847283630883125e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document megawika +1.8175470362373804e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document megawika +1.8152226683368038e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document megawika +1.789149655314284e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document megawika +1.7690523036477663e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document megawika +1.8333732213753644e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document megawika +1.8794105687718654e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document megawika +1.721841156706417e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document megawika +2.0612008685724796e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document megawika 
+1.9297370681336376e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document megawika +2.0188440409661018e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document megawika +5.1741216329695265e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document megawika +1.3417913926038429e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document megawika +1.1010813016469651e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document megawika +1.1252416134320087e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document megawika +1.2801744104313002e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document megawika +1.3041514955795817e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document megawika +1.3428837580879075e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document megawika +1.320809382267804e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document megawika +1.3451566676555968e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document megawika +1.228284926657501e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document megawika +1.2410599573923043e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document megawika +1.3815343367377182e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document megawika +1.3895126265148832e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document megawika +1.2306773644401741e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document megawika +1.32981021906281e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document megawika +1.101337469221607e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document megawika +1.513094184404692e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document megawika +1.1073759547073234e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document megawika +1.2879348765857567e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document megawika +9.619595770228435e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document megawika +1.2384340836286436e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document megawika +1.1766667232211577e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document megawika +1.2871049236196452e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document megawika +1.2010645926497744e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document megawika 
+1.3971428231518597e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document megawika +1.2283733550547932e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document megawika +1.2659530508255308e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document megawika +1.551775613074462e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document megawika +1.1169413343776979e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document megawika +1.1433700593712463e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document megawika +4.964773647323492e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document megawika +1.0995586595687313e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document megawika +1.2957393071411267e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document megawika +2.75899247407709e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document megawika +2.8269344597344854e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document megawika +2.329108187246831e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document megawika +2.4231761430460284e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document megawika +1.2434140512230442e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document megawika +1.638718338352859e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document megawika +3.272953556801187e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document megawika +6.061314500486327e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document megawika +1.2465979731210292e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document megawika +1.2737557327967737e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document megawika +1.038428658075627e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document megawika +2.61666472045566e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document megawika +3.6506873212272224e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document megawika +1.5066359138295701e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document megawika +1.1166290872121178e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document megawika +1.5546966228590285e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document megawika +1.2583434625014828e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document megawika 
+1.3398826881300862e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document megawika
+1.2944933160515968e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document megawika
+1.0971437399901365e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document megawika
+1.2787922795775774e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document megawika
+1.404979227816985e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document megawika
+1.3344734431324463e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document megawika
+4.886031157107555e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document megawika
+3.277261443596394e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document megawika
+3.5057957685786495e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document megawika
+3.287625301718589e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document megawika
+3.1370056372668855e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document megawika
+3.186092015785841e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document megawika
+7.271819324142512e-06 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document megawika
diff --git a/ALCF/data-lists/aurora/open-web-math-train.txt b/ALCF/data-lists/aurora/open-web-math-train.txt
new file mode 100644
index 0000000000..e0dfc30bd7
--- /dev/null
+++ b/ALCF/data-lists/aurora/open-web-math-train.txt
@@ -0,0 +1,13 @@
+0.001451215788905126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document open-web-math-train
+0.0014486847196258788 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document open-web-math-train
+0.0008861032722895899 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document open-web-math-train
+0.0018119590809459816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document open-web-math-train
+0.0008916937917547129 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document open-web-math-train
+6.960128832809415e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document open-web-math-train
+0.002008403651063623 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document open-web-math-train
+0.0014374900742131454 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document open-web-math-train
+0.00180213596996716 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document open-web-math-train
+0.001956178877532413
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document open-web-math-train
+0.0008829547017667033 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document open-web-math-train
+0.0008910853619157279 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document open-web-math-train
+0.0018260998845299973 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document open-web-math-train
diff --git a/ALCF/data-lists/aurora/pes2o.txt b/ALCF/data-lists/aurora/pes2o.txt
new file mode 100644
index 0000000000..3d0cdbe479
--- /dev/null
+++ b/ALCF/data-lists/aurora/pes2o.txt
@@ -0,0 +1,26 @@
+0.0012499632072059553 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document pes2o
+0.00125398260359913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document pes2o
+0.0012541704774729071 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document pes2o
+0.0012527268234360602 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document pes2o
+0.0012532925243737164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document pes2o
+0.0012456396241204315 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document pes2o
+0.0012589894424352072 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document pes2o
+0.001508020123999618 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document pes2o
+0.00333096950781965 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document pes2o
+0.0033233414614415547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document pes2o
+0.003512387990689828 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document pes2o
+0.0035091382940513126 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document pes2o
+0.003514155927147005 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document pes2o
+0.003327108000579638 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document pes2o
+0.003329106196589836 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document pes2o
+0.003505604148738077 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document pes2o
+0.003324825759567855 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document pes2o
+0.0033248240149804913 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document pes2o
+0.0033385962112851358 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document pes2o
+0.0035043186296553615 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document pes2o
+0.003340469505431529 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document pes2o
+0.0035106889084796276
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document pes2o
+0.0033309469281030167 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document pes2o
+0.003340337858029757 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document pes2o
+0.003505919861097801 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document pes2o
+0.0003882924098240512 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document pes2o
diff --git a/ALCF/data-lists/aurora/reddit.txt b/ALCF/data-lists/aurora/reddit.txt
new file mode 100644
index 0000000000..ebc1e15ada
--- /dev/null
+++ b/ALCF/data-lists/aurora/reddit.txt
@@ -0,0 +1,78 @@
+0.0005759963691850877 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document reddit
+0.0005959971675332674 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document reddit
+0.0006026179290353799 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document reddit
+0.0005824184320784846 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document reddit
+0.0005854598548616037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document reddit
+0.0005903767055633473 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document reddit
+0.0005930306490982049 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document reddit
+0.000569425602700746 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document reddit
+0.0005675060415179408 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document reddit
+0.0005772431621253389 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document reddit
+0.0005678026053826858 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document reddit
+0.0005700398263483378 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document reddit
+0.0005669467963528824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document reddit
+0.0005701015953324305 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document reddit
+0.0005795907287413296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document reddit
+0.0005735602737531164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document reddit
+0.0005749862745842101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document reddit
+0.0005693257015931971 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document reddit
+0.0005716568794795563 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document reddit
+0.0005761083919774021 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document reddit
+0.0005688343169797355
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document reddit +0.0005807913190929842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document reddit +0.0005710229258078636 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document reddit +0.0005704083039826862 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document reddit +0.0005862132348308056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document reddit +0.0005717662049559556 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document reddit +0.0005858155213694451 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document reddit +0.0005812012281792392 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document reddit +0.0005803981414588498 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document reddit +0.0005700102108287723 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document reddit +0.0005719243459052329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document reddit +0.0005867253401661752 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document reddit +0.0005731087218860733 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document reddit +0.0005712197789109317 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document reddit +0.0005702376926310089 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document reddit +0.0005700411527742972 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document reddit +0.0005828090098178196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document reddit +0.0005770140826168056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document reddit +0.0005723509664597896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document reddit +0.0005755499231836962 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document reddit +0.0005636407438471367 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document reddit +0.0005640281556500104 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document reddit +0.0005633159058766496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document reddit +0.0005638034311151449 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document reddit +0.0005630066273073224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document reddit +0.0005631803831128559 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document reddit +0.0005631228881679657 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document reddit 
+0.0005628178701487633 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document reddit +0.0005624448092256196 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document reddit +0.0005620957024062329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document reddit +0.0005614201504177484 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document reddit +0.0005616890951464056 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document reddit +0.0005611348559279058 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document reddit +0.0005604238061828518 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document reddit +0.0005603301490194237 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document reddit +0.0005607291294548833 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document reddit +0.0005605234569930727 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document reddit +0.0005613778566640694 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document reddit +0.0005610248539992471 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document reddit +0.0005599977416780475 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document reddit +0.0005603632562116935 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document reddit +0.0005599177479509897 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document reddit +0.0005595202318298379 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document reddit +0.0005600975633499175 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document reddit +0.0005614075491213365 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document reddit +0.000612563885043477 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document reddit +0.0005515469909644413 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document reddit +0.0005526782014946906 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document reddit +0.0005472463408095445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document reddit +0.0005502284746004587 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document reddit +0.0005414514790555363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document reddit +0.0005513499500134784 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document reddit +0.0005391391454105187 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document reddit +0.0005415836910001838 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document reddit
+0.0005208132468536551 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document reddit
+0.0005889827143132871 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document reddit
+0.0005822520817765276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document reddit
+0.0004173155230758696 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document reddit
diff --git a/ALCF/data-lists/aurora/stack.txt b/ALCF/data-lists/aurora/stack.txt
new file mode 100644
index 0000000000..d99516f5fb
--- /dev/null
+++ b/ALCF/data-lists/aurora/stack.txt
@@ -0,0 +1,26 @@
+0.0009994361338078242 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document stackexchange
+0.001087156194657966 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document stackexchange
+0.0010667737163656816 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document stackexchange
+0.0009602877882124873 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document stackexchange
+0.0008968956271971105 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document stackexchange
+0.0009198034843762967 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document stackexchange
+0.0009423901016715341 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document stackexchange
+0.0009674094553686345 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document stackexchange
+0.0009858331322519164 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document stackexchange
+0.0009970593645879198 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document stackexchange
+0.0010027035193731686 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document stackexchange
+0.0010128291154221853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document stackexchange
+0.0010215631382631918 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document stackexchange
+0.0010288663771461238 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document stackexchange
+0.0010346219929285867 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document stackexchange
+0.00104544019940344 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document stackexchange
+0.0010525172676724333 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document stackexchange
+0.0010609529620775127 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document stackexchange
+0.0010725892748610153
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document stackexchange
+0.0010818563598181568 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document stackexchange
+0.0010992760196793917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document stackexchange
+0.0011178992762079917 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document stackexchange
+0.001124687532085676 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document stackexchange
+0.001118303661267191 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document stackexchange
+0.0010206825575416534 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document stackexchange
+0.0005512280117499715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document stackexchange
diff --git a/ALCF/data-lists/aurora/starcoder.txt b/ALCF/data-lists/aurora/starcoder.txt
new file mode 100644
index 0000000000..2a5be0cf72
--- /dev/null
+++ b/ALCF/data-lists/aurora/starcoder.txt
@@ -0,0 +1,50 @@
+0.004474659408857016 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document starcoder
+0.00409944473890653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document starcoder
+0.005137179939941845 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document starcoder
+0.005143172251066109 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document starcoder
+0.005206134363352808 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document starcoder
+0.004892747858974329 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document starcoder
+0.004844731352552902 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document starcoder
+0.005308320169123755 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document starcoder
+0.005124709815666577 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document starcoder
+0.005424710744483826 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document starcoder
+0.00538244648861977 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document starcoder
+0.0029107284679086853 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document starcoder
+0.0026825258998444705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document starcoder
+0.0026904503191419243 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document starcoder
+0.002687906577174073 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document starcoder
+0.002850165346048818 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document starcoder
+0.005322698571717847 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document starcoder +0.004450334290869719 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document starcoder +0.004700990083440683 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document starcoder +0.003903568556500995 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document starcoder +0.00390561515396931 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document starcoder +0.0039046402900912262 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document starcoder +0.003907454839379547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document starcoder +0.0038583224578603824 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document starcoder +0.0037914116657695 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document starcoder +0.003786665266798682 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document starcoder +0.003792000802430658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document starcoder +0.00319266847466091 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document starcoder +0.0032658716699838944 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document starcoder +0.0034801959532460023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document starcoder +0.0028307012092022594 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document starcoder +0.0028420360878146276 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document starcoder +0.0028410455248484914 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document starcoder +0.00283497183526842 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document starcoder +0.002840187195459487 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document starcoder +0.0028398709431369834 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document starcoder +0.004364722843422023 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document starcoder +0.004093255713117101 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document starcoder +0.004092331079566252 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document starcoder +0.004005326985579649 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document starcoder +0.0036205502856964207 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document starcoder +0.003625316793034984 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document starcoder +0.003604743435602363 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document starcoder +0.0035405823343673125 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document starcoder +0.0041601413517253945 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document starcoder +0.005886303658937057 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document starcoder +0.003600909532810332 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document starcoder +0.0034941365817168658 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document starcoder +0.0004992164842980224 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document starcoder + diff --git a/ALCF/data-lists/aurora/tulu.txt b/ALCF/data-lists/aurora/tulu.txt new file mode 100644 index 0000000000..46b3a91a40 --- /dev/null +++ b/ALCF/data-lists/aurora/tulu.txt @@ -0,0 +1,66 @@ +0.00032927705604725614 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document tulu +0.0002860154190878753 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document tulu +0.0002845217585425619 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document tulu +0.0002743528685497456 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document tulu +0.00026025323737738766 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document tulu +0.00023493876414603155 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document tulu +0.00029665994994226705 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document tulu +0.00031808102075993956 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document tulu +0.00031813573046011285 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document tulu +0.0002711905171855542 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document tulu +0.00028892513401817095 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document tulu +0.00030003908676979083 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document tulu +0.00026839878771944684 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document tulu +0.00029155935002690497 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document tulu +0.0002998624927624209 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document tulu +0.0003091705447974841 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document tulu +0.00026873195794309786 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document tulu 
+0.00027721873498527547 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document tulu +0.0002841662554024377 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document tulu +0.0002839461156551537 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document tulu +0.0002861705604659811 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document tulu +0.0002460995649635886 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document tulu +0.00019420142619795496 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document tulu +0.00021967677816173628 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document tulu +0.0002620283200480949 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document tulu +0.0002433390542188936 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document tulu +0.00021254976608350767 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document tulu +0.00022094815569522115 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document tulu +0.000342862378668244 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document tulu +0.00033784225259118157 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document tulu +0.0003367278459543952 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document tulu +0.00029843279042852765 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document tulu +0.0002926583661257988 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document tulu +0.00029320337282010673 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document tulu +0.00029281450669483455 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document tulu +0.0002915338187002653 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document tulu +0.0002864226923084572 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document tulu +0.00028643439083586396 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document tulu +0.00028253710956299054 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document tulu +0.0002810856078805806 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document tulu +0.00031474941344656715 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document tulu +0.0002139130222205655 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document tulu +0.0003084648871862831 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document tulu +0.0003309477872140129 
/flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document tulu +0.0003360096824695161 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document tulu +0.0003355452655196557 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document tulu +0.00038119390366386037 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document tulu +0.00038078927630086064 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document tulu +0.0003386200917551554 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document tulu +0.0002158905159938882 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document tulu +0.00021621682877018768 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document tulu +0.00021553306942740535 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document tulu +0.00021581563462722296 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document tulu +0.0002157694110556169 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document tulu +0.000215643699847159 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document tulu +0.00021532716715168094 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document tulu +0.00021531221326022472 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document tulu +0.0002831801179028896 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document tulu +0.0002514844936507595 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document tulu +0.00031638782778107964 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document tulu +0.0002749197545278445 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document tulu +0.00026159721512464495 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document tulu +0.0002630052420096968 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document tulu +0.00031106811228913666 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document tulu +0.0002852973415334161 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document tulu +3.7555372465932136e-05 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document tulu diff --git a/ALCF/data-lists/aurora/wiki.txt b/ALCF/data-lists/aurora/wiki.txt new file mode 100644 index 0000000000..c70a54f598 --- /dev/null +++ b/ALCF/data-lists/aurora/wiki.txt @@ -0,0 +1,2 @@ +0.003548077173506675 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document wiki +0.0018372203137874265 /flare/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document wiki diff --git a/ALCF/data-lists/polaris/algebraic.txt 
b/ALCF/data-lists/polaris/algebraic.txt new file mode 100644 index 0000000000..394649fcf4 --- /dev/null +++ b/ALCF/data-lists/polaris/algebraic.txt @@ -0,0 +1,16 @@ +0.0018520780893211373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document algebraic-stack-train +0.0017591050606817512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document algebraic-stack-train +0.001459052794333798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document algebraic-stack-train +0.0007405667281569194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document algebraic-stack-train +0.00019420030110896795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document algebraic-stack-train +0.0009008668715801845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document algebraic-stack-train +0.00015115827957143057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document algebraic-stack-train +0.0014552844319220648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document algebraic-stack-train +0.0012469861325685161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document algebraic-stack-train +0.00136412011372413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document algebraic-stack-train +0.0007064279699221103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document algebraic-stack-train +0.0008472240000687427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document algebraic-stack-train +0.0001984375713341955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document algebraic-stack-train +0.0005472773881697123 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document algebraic-stack-train +0.001815779629850992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document algebraic-stack-train +0.0018313600689757324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document algebraic-stack-train diff --git a/ALCF/data-lists/polaris/arxiv.txt b/ALCF/data-lists/polaris/arxiv.txt new file mode 100644 index 0000000000..85e59adacd --- /dev/null +++ b/ALCF/data-lists/polaris/arxiv.txt @@ -0,0 +1,100 @@ +0.0002583902668716813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document arxiv +0.0002646575141232155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document arxiv +0.0003165521247456758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document arxiv +0.0002920706460176214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document arxiv +0.00028396813182810215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document arxiv +0.00030445161883108107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document arxiv +0.00031628781276576474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document arxiv +0.0003083776568189157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document arxiv +0.0003176359471472902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document arxiv +0.0002536009369131698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document arxiv 
+0.0003067491424681363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document arxiv +0.0002597217257557784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document arxiv +0.0003788556450109768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document arxiv +0.0002796563272052598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document arxiv +0.00033573826524290287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document arxiv +0.00030523658022800287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document arxiv +0.00032211552192240096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document arxiv +0.0003329295675164247 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document arxiv +0.0003101982186639862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document arxiv +0.00032361798234223355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document arxiv +0.0003495541581652915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document arxiv +0.0002821637448858042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document arxiv +0.00030399523537629673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document arxiv +0.0002955658968247219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document arxiv +0.00028942158502924254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document arxiv +0.00028769546171490733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document arxiv +0.0002938111057234182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document arxiv +0.0002711150403010948 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document arxiv +0.00031130095874747565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document arxiv +0.0003002996118160777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document arxiv +0.0003732757901604459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document arxiv +0.00026784205751795894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document arxiv +0.0002799626521661984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document arxiv +0.00034334276069078164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document arxiv +0.0003582469803674965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document arxiv +0.00031094844818418623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document arxiv +0.0002766228384977191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document arxiv +0.00030297116159471485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document arxiv +0.00027033888377464685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document arxiv +0.00030090862368377933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document arxiv +0.00028543875802490955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document arxiv +0.00027559768459074204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document arxiv +0.0003182185533962886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document arxiv +0.0003311392971435837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document arxiv +0.00028751652060804325 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document arxiv +0.000303466863212589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document arxiv +0.00033400462801277524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document arxiv +0.0002589234031777426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document arxiv +0.0002913508598466723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document arxiv +0.0002670572450004856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document arxiv +0.00032027399105647656 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document arxiv +0.00032188376258379377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document arxiv +0.0003161585784100882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document arxiv +0.0003184249182974135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document arxiv +0.00030381336664000807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document arxiv +0.0003190437442184283 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document arxiv +0.0002537961798200545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document arxiv +0.0003017817117223326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document arxiv +0.00028685268513240224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document arxiv +0.00031265179094451165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document arxiv +0.00034708319096986816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document arxiv +0.00026650837943080664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document arxiv +0.00034588832248507335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document arxiv +0.0002416982248399037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document arxiv +0.0003089296918222243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document arxiv +0.00029137184185700827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document arxiv +0.00026464226846800774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document arxiv +0.00030545397919456627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document arxiv +0.0003206778460448875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document arxiv +0.00030968971641110967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document arxiv +0.00023325653928600864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document arxiv +0.00030526899198338555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document arxiv +0.00035376719076633584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document arxiv +0.000290224385981026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document arxiv +0.000294650083382008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document arxiv +0.00028768858128616436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document arxiv +0.00030856965235527843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document arxiv +0.00030579942447879054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document arxiv +0.0002863101084704357 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document arxiv +0.0002870032092492213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document arxiv +0.000264182727569885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document arxiv +0.0002974012367036449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document arxiv +0.00032238412143059203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document arxiv +0.00031683716893819036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document arxiv +0.00031157434937617524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document arxiv +0.0003411742735695989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document arxiv +0.00026778444816570715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document arxiv +0.0003037045797275201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document arxiv +0.00027746114370081314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document arxiv +0.00027148285946862043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document arxiv +0.00028042950114678207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document arxiv +0.0003235607816590721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document arxiv +0.0003086692227306295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document arxiv +0.00033990349455148105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document arxiv +0.00030945053208470265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document arxiv +0.00027309074552265303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document arxiv +0.00028737393506316194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document arxiv +0.0003098868328009879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document arxiv +0.0002614229162588409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document arxiv +0.0002884388407820923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document arxiv diff --git a/ALCF/data-lists/polaris/books.txt b/ALCF/data-lists/polaris/books.txt new file mode 100644 index 0000000000..c222c32c07 --- /dev/null +++ b/ALCF/data-lists/polaris/books.txt @@ -0,0 +1,3 @@ +0.0031025147279277244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document books +0.003102019887362634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document books +0.0009996745994661548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document books diff --git a/ALCF/data-lists/polaris/c4.txt b/ALCF/data-lists/polaris/c4.txt new file mode 100644 index 0000000000..512556eafb --- /dev/null +++ b/ALCF/data-lists/polaris/c4.txt @@ -0,0 +1,171 @@ +0.0002406272620255565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document c4 +0.0002404825539493424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document c4 +0.00024062296575435581 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document c4 +0.00024069315766818953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document c4 +0.00024055829162263452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document c4 +0.00024062053397343032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document c4 +0.0002410715545206964 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document c4 +0.00024024881846087368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document c4 +0.0002407074700790688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document c4 +0.00024072141428809043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document c4 +0.00024027710230872736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document c4 +0.0002409111299205489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document c4 +0.00024081954058275009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document c4 +0.00024086076794990912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document c4 +0.00024098672620832446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document c4 +0.00024068622303333862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document c4 +0.00024140627024291824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document c4 +0.0002414512033594384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document c4 +0.00024028742594941463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document c4 +0.00024018036089269645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document c4 +0.0002398347365034979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document c4 +0.00024006780153485276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document c4 +0.00024015620270419213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document c4 +0.0002408848259695227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document c4 +0.0002408023185278831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document c4 +0.00024021196580140326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document c4 +0.00024077677271297493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document c4 +0.00024087392454668027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document c4 +0.0002408071293824126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document c4 +0.00024042223828845715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document c4 +0.0002411484752360495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document c4 +0.00023605263746465907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document c4 +0.00023471222158326908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document c4 +0.00023432138580287644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document c4 +0.00023407385623382327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document c4 +0.00023487504174367091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document c4 +0.0002341843704976313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document c4 +0.00023421993170282486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document c4 +0.00023445057969132037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document c4 +0.0002337681680073047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document c4 +0.000234627964808109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document c4 +0.0002338942211888584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document c4 +0.00023403849286843386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document c4 
+0.00023405641310796305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document c4 +0.00023349169562397965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document c4 +0.00023381157386048856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document c4 +0.00023388742993790587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document c4 +0.00023363103829469813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document c4 +0.00023421141834630477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document c4 +0.00023420564352232565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document c4 +0.00023367463699173143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document c4 +0.00023344969163567033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document c4 +0.00023372196941547188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document c4 +0.00023399207645297834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document c4 +0.00023357915605505856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document c4 +0.00023337585642190864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document c4 +0.00023385005470157914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document c4 +0.00023301533534493465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document c4 +0.00023377864302541782 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document c4 +0.00023323745848621437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document c4 +0.0002330594611151835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document c4 +0.0002334149675026783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document c4 +0.00023198945902291534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document c4 +0.00023023784834634142 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document c4 +0.00022985623060187217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document c4 +0.0002292605284569516 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document c4 +0.00022926593333048894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document c4 +0.00022922766406807777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document c4 +0.00022898153911167426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document c4 +0.0002292473111593315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document c4 +0.000228804579400424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document c4 +0.00022865485613513526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document c4 +0.00022937426835887895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document c4 +0.00022917388311587372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document c4 +0.0002291660582019043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document c4 +0.00022907895248360543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document c4 +0.0002294617879920205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document c4 +0.0002290452150516566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document c4 +0.00022943405619715553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document c4 +0.0002296271421006204 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document c4 +0.00022854791372910372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document c4 +0.00022923123467686557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document c4 +0.00022852404355738494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document c4 +0.00022847798660086642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document c4 +0.0002289604586810316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document c4 +0.00022835479834950643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document c4 +0.0002289149402884243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document c4 +0.00022806655474763446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document c4 +0.00022826296420992974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document c4 +0.00022906829636213627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document c4 +0.0002287628414466998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document c4 +0.0002282673911253445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document c4 +0.00022869309841939134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document c4 +0.0002281540116815451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document c4 +0.0002259755756162738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document c4 +0.00022562331285233504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document c4 +0.0002259061146106053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document c4 +0.00022567670836663787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document c4 +0.00022573165387587061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document c4 +0.00022508514961670572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document c4 +0.00022564642513773356 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document c4 +0.00022563088621998788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document c4 +0.0002250438755373707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document c4 +0.00022524465346241134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document c4 +0.00022531737657666812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document c4 +0.00022444687519363458 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document c4 +0.00022460397498596298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document c4 +0.00022454218976501763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document c4 +0.00022447528843671366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document c4 +0.00022501666332178926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document c4 +0.00022453752304377972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document c4 +0.00022484451871163002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document c4 +0.00022465678847154914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document c4 +0.00022453180917044732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document c4 +0.0002247278486823009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document c4 +0.00022465794828242097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document 
c4 +0.00022431000701925386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document c4 +0.00022476020248460963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document c4 +0.00022467531771795015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document c4 +0.0002236391309945234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document c4 +0.00022458764920536007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document c4 +0.00022430877426744415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document c4 +0.0002247047786127192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document c4 +0.0002245298090400035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document c4 +0.0002245648831396188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document c4 +0.00022292894729820784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document c4 +0.00022236668082957533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document c4 +0.0002217622659895442 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document c4 +0.00022252452726732609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document c4 +0.00022135333211363678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document c4 +0.0002214571757787971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document c4 +0.0002217188139237798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document c4 +0.00022144214894640303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document c4 +0.00022100172806631854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document c4 +0.00022156392409199052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document c4 +0.00022134830143710272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document c4 +0.00022158598922529453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document c4 +0.00022142932483041377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document c4 +0.00022120980907786554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document c4 +0.00022117917738112441 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document c4 +0.00022077089397851235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document c4 +0.00022093265074996711 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document c4 +0.00022091299741377004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document c4 +0.0002205849150703338 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document c4 +0.0002210648204787979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document c4 +0.0002214235747364102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document c4 +0.00022083907302221787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document c4 +0.0002206334237915964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document c4 +0.00022065193929912214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document c4 +0.00022079775597767288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document c4 +0.00022091492909963518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document c4 +0.00022095009987097293 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document c4 +0.0002208150577180165 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document c4 +0.00022085759102772088 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document c4 +0.00022073789170129016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document c4 +0.00022049322781182384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document c4 +0.00022083270617761285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document c4 +0.00021982452827473632 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document c4 +0.00021899870446514259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document c4 +0.00021890358773356361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document c4 +0.00021875556609042841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document c4 +0.00021861195987201226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document c4 +0.00021856782186167455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document c4 +0.00021912837771543515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document c4 +0.00021900213768517756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document c4 +0.00021871675851390374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document c4 +0.0002180537056545586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document c4 +0.0002188196714327129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document c4 +0.00021851362624523464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document c4 +0.0002183236795498736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document c4 +7.291153618675672e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document c4 diff --git a/ALCF/data-lists/polaris/cc.txt b/ALCF/data-lists/polaris/cc.txt new file mode 100644 index 0000000000..75485866e6 --- /dev/null +++ b/ALCF/data-lists/polaris/cc.txt @@ -0,0 +1,1108 @@ +0.0003742481815405742 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document cc +0.00038204855962733055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document cc +0.00038821818392663593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document cc +0.00038723332988783727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document cc +0.00038916141142149904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document cc +0.00038049542523949033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document cc +0.0003854755539534284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document cc +0.00024202756466512517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document cc +0.0003915405155008087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document cc +0.0003927382151931033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document cc +0.0003839151202260479 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document cc +0.00040006817468967907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document cc +0.00040318965964443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document cc +0.0003831013019452741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document cc +0.00039166638383204036 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document cc +0.00039962784023961004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document cc +0.00039536707853602614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document cc +0.0004204304698247758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document cc +0.00041538899178693555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document cc +0.00039186953333675306 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document cc +0.00038945837196504305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document cc +0.0003919951238929062 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document cc +0.00044377065718528966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document cc +0.0004407759068603017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document cc +0.0002487811895843715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document cc +0.00039349432045556636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document cc +0.00041223198559462343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document cc +0.0004036573014830213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document cc +0.0003825982215521807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document cc +0.00040386867133151386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document cc +0.00024460575279105167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document cc +0.000269029789531335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document cc +0.0003573757493252864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document cc +0.0004600876681392076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document cc +0.0002605354166397086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document cc +0.0003882502452157999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document cc +0.0002466747612126512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document cc +0.0004024726105072402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document cc +0.00040820631128483644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document cc +0.0002691094350403538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document cc +0.00026916830387277267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document cc +0.0004204663297880574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document cc +0.00042379698687085554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document cc +0.0004502169227311871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document cc +0.0002661708937015295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document cc +0.00031239486948031334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document cc +0.0003109054589936201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document cc +0.00045873053079760646 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document cc +0.00022904931423244635 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document cc +0.0003813462028433663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document cc +0.00039188129256500874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document cc +0.00045124222276983765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document cc +0.00048138658436853695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document cc +0.0003944178776279866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document cc +0.00039941569676754006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document cc +0.00037952761190240494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document cc +0.0003944870860881476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document cc +0.0003891842411856621 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document cc +0.000387688981934861 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document cc +0.00039197953876258005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document cc +0.00039007915280311206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document cc +0.0003995520363699188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document cc +0.00039230985654592406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document cc +0.0003929472067173851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document cc +0.0003924096172671473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document cc +0.0003881636143629905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document cc +0.000389790617937084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document cc +0.00037351762309221023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document cc +0.0003630196170929407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document cc +0.00033532465765142113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document cc +0.0003076088685761823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document cc +0.00039463850897720803 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document cc +0.0002843816115231449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document cc +0.0002909175709416474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document cc +0.00028867170997202486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document cc +0.0002838644617723659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document cc +0.00029027869525543416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document cc +0.0002821339567560056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document cc +0.0002922988877045601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document cc +0.0002866955958315786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document cc +0.0002865271754558126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document cc +0.0002861247475618473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document cc +0.0002826681072408606 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document cc +0.0002849746458282827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document cc +0.0002816966633435316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document cc +0.00026255342235948463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document cc +0.0002552895098829678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document cc +0.00025990194083107813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document cc +0.0002524062657685835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document cc +0.0002538577379748611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document cc +0.0002561415177406761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document cc +0.00026206253059694905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document cc +0.00026168095406910565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document cc +0.0002601305742008613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document cc +0.00025200823006814814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document cc +0.0003229951981263502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document cc +0.00037289448266476045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document cc +0.0003807825862179898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document cc +0.0003616333738191483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document cc +0.0003665117918907636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document cc +0.0003684186453633228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document cc +0.0003589330610806066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document cc +0.00036383861418030395 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document cc +0.000359841363355303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document cc +0.00036431044063050464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document cc +0.0003668574090358279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document cc +0.000362768263620199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document cc +0.0003501888032771077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document cc +0.000352401968221528 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document cc +0.0003541019701869794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document cc +0.0003628121865546891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document cc +0.0003752582953758773 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document cc +0.00037902046230424966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document cc +0.0003777927146925147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document cc +0.0003760676130509053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document cc +0.00034046049078755405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document cc +0.0003338847563259091 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document cc +0.00033294499102761794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document cc +0.0004912026198265864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document cc +0.00032064363474664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document cc +0.00032154190389541214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document cc +0.00032309660151746207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document cc +0.00031181143365304544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document cc +0.00031046092294569104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document cc +0.00031150165249068046 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document cc +0.0003041314265988224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document cc +0.0003024834909739394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document cc +0.0003019936835833604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document cc +0.000292329665283177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document cc +0.0002867061143144972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document cc +0.00028443615610701707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document cc +0.00028462291013755945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document cc +0.0002793538601205013 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document cc +0.00027306573977044246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document cc +0.00027097155673336525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document cc +0.0002752934202112985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document cc +0.00043042012694697647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document cc +0.00047495648822986177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document cc +0.00047755032493473855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document cc +0.0004706974343933747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document cc +0.00046682163297771817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document cc +0.0004616765425874178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document cc +0.00030644496751628097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document cc +0.0002909492555358308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document cc +0.00027272036068261724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document cc +0.0004101070217315588 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document cc +0.0003728914338834357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document cc +0.00036546911442305647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document cc +0.0003669945482407483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document cc +0.0003715902407424017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document cc +0.00035837486406683366 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document cc +0.0003573318538685469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document cc +0.0003553784893071916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document cc +0.0004920659809912352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document cc +0.0004533619411303183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document cc +0.00045067066057818706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document cc +0.00044396985139270645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document cc +0.00043198288204468477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document cc +0.00043005174223738454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document cc +0.00041847118430776784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document cc +0.00042952036375796664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document cc +0.00043420594647324267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document cc +0.0003461123241053012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document cc +0.0003408581597849182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document cc +0.00033172705422182547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document cc +0.0003392566490686136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document cc +0.00033578341518385483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document cc +0.0003439196710518844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document cc +0.00034559163447085543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document cc +0.00033762478642902825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document cc +0.00033215210055107224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document cc +0.00033423579608014966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document cc +0.0004963355016025102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document cc +0.0004996862761456923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document cc +0.0005000551829325451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document cc +0.0005004212610098755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document cc +0.00027768695585500585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document cc +0.00028395983854338433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document cc +0.00027835826303062254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document cc +0.0002740073176010804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document cc +0.0002791830529274016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document cc +0.0002796863816194411 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document cc +0.00026697453022672804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document cc +0.0002594197440280141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document cc +0.0003779565697649222 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document cc +0.00041835823476586606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document cc +0.00043788493575265915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document cc +0.0002731731970096006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document cc +0.000276305847423402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document cc +0.0002704955773958623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document cc +0.0002629635944827518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document cc +0.000260070956974436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document cc +0.00025661553791456334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document cc +0.00025794727207576157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document cc +0.00025295733980001527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document cc +0.0003788106407021029 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document cc +0.0004882344027669431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document cc +0.0003275324309642705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document cc +0.0004803401856640094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document cc +0.00046720138323433943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document cc +0.00043527810307095335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document cc +0.00043905395741627827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document cc +0.00048774175867331425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document cc +0.00048380704121346737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document cc +0.0004779011848346118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document cc +0.00046255587581908036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document cc +0.00045127922880511576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document cc +0.0004503891485256095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document cc +0.0004450142332303422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document cc +0.00044630282482516654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document cc +0.00044325014465743616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document cc +0.0004263874842796447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document cc +0.0004217530913646938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document cc +0.000415120314341852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document cc +0.00040987168279144537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document cc +0.00033468337266607834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document cc +0.0003353094464683005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document cc +0.0004833936821707294 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document cc +0.00047194878988920935 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document cc +0.0004648324126996427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document cc +0.0004562345003964941 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document cc +0.0004933203505465098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document cc +0.0003530166075325466 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document cc +0.00035368548192804685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document cc +0.0004872620828289663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document cc +0.00048293889392426456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document cc +0.00047936768462267655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document cc +0.00047821013991587545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document cc +0.0004660610308564753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document cc +0.000394683430103437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document cc +0.00039165053441571324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document cc +0.0003906936040164381 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document cc +0.00038074803919159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document cc +0.0003686529291578143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document cc +0.00035832920428870976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document cc +0.00035929024535947033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document cc +0.0003538226556050544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document cc +0.0003584167868708799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document cc +0.0003480507542594234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document cc +0.0003413709023543034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document cc +0.00034001304759361455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document cc +0.00033430532902756514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document cc +0.00046519252660631277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document cc +0.0002938876402514769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document cc +0.00028676090994509047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document cc +0.00027296150117506716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document cc +0.00026513502621960483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document cc +0.0002680081327926125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document cc +0.00025831225828720344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document cc +0.00026647037295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document cc +0.0002525733734572654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document cc +0.00025831708887575375 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document cc +0.00042487627444443476 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document cc +0.0004951213245023891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document cc +0.0004804051413177752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document cc +0.0004662397611340532 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document cc +0.0004550138655253933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document cc +0.00044494909122746795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document cc +0.0002899112253051385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document cc +0.0004372879736279761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document cc +0.0004529568099252922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document cc +0.00045127826158829573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document cc +0.0004436558176737439 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document cc +0.0004419233237678378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document cc +0.000434589215880319 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document cc +0.00029153613207706566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document cc +0.0004312458058738854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document cc +0.00028741854968757313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document cc +0.00046853200754421234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document cc +0.0004949145252030074 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document cc +0.00044459683920483167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document cc +0.0003836095306696336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document cc +0.0003789760237872398 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document cc +0.0003749227438304427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document cc +0.0003628558277173369 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document cc +0.00039468301394041474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document cc +0.00038874701821614864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document cc +0.0004158492456077867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document cc +0.00042360504554060077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document cc +0.00040386729844317623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document cc +0.00027595096702902474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document cc +0.00043638766787829135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document cc +0.0002218691596850179 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document cc +0.0004437566108089954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document cc +0.0003889996411609667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document cc +0.00043454421906537704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document cc 
+0.0004522564392830988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document cc +0.00041517835659357416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document cc +0.0002614360863446896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document cc +0.00037543522111463596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document cc +0.0004386190133514781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document cc +0.00046358333286115075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document cc +0.00043186261317942404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document cc +0.0002377581602097957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document cc +0.00025973334085074254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document cc +0.00040139099332000796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document cc +0.00043674860686687174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document cc +0.00040853289309329373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document cc +0.000242910191729688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document cc +0.0004431071731750582 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document cc +0.0004388092670482523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document cc +0.000381418866255965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document cc +0.0004100117296419717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document cc +0.00042469230366022745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document cc +0.00041744151905374254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document cc +0.00022835699906752945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document cc +0.0004380161085387397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document cc +0.00044803212381807456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document cc +0.00040554932796137236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document cc +0.0004234508646347761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document cc +0.00043341209652360653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document cc +0.00023966604734537185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document cc +0.000259165907316014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document cc +0.0004270653021833602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document cc +0.0004341547032162028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document cc +0.0004111478117275994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document cc +0.0004299383567984396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document cc +0.0004241899124590779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document cc +0.0004502719349364145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document cc +0.00038994621469645615 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document cc +0.0003859912398894952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document cc +0.0004247535950310557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document cc +0.000386982084327716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document cc +0.0004196451040053251 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document cc +0.0004096278509782259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document cc +0.0004373334932695721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document cc +0.0004180889975240641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document cc +0.00042079636929672745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document cc +0.00038063574611812913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document cc +0.0003817505891515542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document cc +0.0004420096268860222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document cc +0.00039182670726410623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document cc +0.0003635667850372299 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document cc +0.00041564996472055667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document cc +0.000400529358757286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document cc +0.0003939113874958451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document cc +0.00039066622068940996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document cc +0.0004290098538807143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document cc +0.0004240739958197099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document cc +0.00040775392659215333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document cc +0.0004091634200396925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document cc +0.00042299190476617914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document cc +0.0003701492680344151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document cc +0.0003807353844384635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document cc +0.00038813507771983156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document cc +0.00040072346558408346 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document cc +0.0003603595180423597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document cc +0.00038799421353112465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document cc +0.00037575235582264926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document cc +0.0004239190342959713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document cc +0.0004606044799136546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document cc +0.00045107950652529253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document cc +0.0004391947201871058 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document cc +0.0004457516661123035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document cc +0.0004301297170991686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document cc +0.00044661704164586694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document cc +0.0004438849846114837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document cc +0.0004444205734316823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document cc +0.0004190924165303394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document cc +0.00043942581131677875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document cc +0.00021568459798090663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document cc +0.0003814929225407199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document cc +0.0003217453179359235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document cc +0.00031719591470267974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document cc +0.00032434115726922137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document cc +0.0004079911120371051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document cc +0.000329492766381148 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document cc +0.0003845916162001633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document cc +0.0003835208964390098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document cc +0.00037847334157173194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document cc +0.00038296039903791865 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document cc +0.00037896336828472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document cc +0.00037620974396391355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document cc +0.00037420590727111843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document cc +0.000340490625886403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document cc +0.0003078314411035827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document cc +0.00034153990750656097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document cc +0.0003308858103982067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document cc +0.0003452640607156025 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document cc +0.00033095276418403455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document cc +0.0003116308995860414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document cc +0.00032446713226408477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document cc +0.0003015816821912984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document cc +0.00031612418775706894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document cc +0.0003278516344971041 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document cc +0.00033079446736097217 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document cc +0.00032278977146550837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document cc +0.00032065272988207914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document cc +0.0003936696452406576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document cc +0.0003450109536627789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document cc +0.0003339787189919641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document cc +0.0003284303856176974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document cc +0.00033652677276843477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document cc +0.0003257822443845694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document cc +0.0003293985569149334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document cc +0.0003310360260148262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document cc +0.0003233770986418526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document cc +0.0003172280092149422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document cc +0.0003160674744292835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document cc +0.00030931090289598506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document cc +0.0003093173886443107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document cc +0.00033167847081104083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document cc +0.00031131501311729723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document cc +0.00031046608876279845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document cc +0.00030569235942207244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document cc +0.00030777943671285197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document cc +0.00029303314290956683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document cc +0.0003045824546400205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document cc +0.00030360880677729793 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document cc +0.00031646239964835433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document cc +0.0003129122300603785 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document cc +0.00031060464956661433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document cc +0.000311819032500067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document cc +0.0002977872483902282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document cc +0.0003009448600922438 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document cc +0.00028610292098537774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document cc +0.0002988326876216654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document cc +0.00028550828372819075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document cc +0.0002830381750875739 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document cc +0.0002848495855927156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document cc +0.0002856443760308144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document cc +0.00027442895344188584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document cc +0.0002681160554049462 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document cc +0.0003421482544126989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document cc +0.0004005872948449718 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document cc +0.0003930123959320308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document cc +0.0003867271832275778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document cc +0.000380805140455254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document cc +0.0003814769861947819 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document cc +0.00038025170883282324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document cc +0.0003738026647867475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document cc +0.00018960856915036276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document cc +0.0003697177501953134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document cc +0.00036674194328136693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document cc +0.00036447406838697555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document cc +0.00036686410861101255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document cc +0.00035915267825103423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document cc +0.0003624758404026675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document cc +0.0002822812140180794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document cc +0.00030620512946920813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document cc +0.000294249776520589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document cc +0.00030238536967523434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document cc +0.00029509593361580754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document cc +0.0002906912701830899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document cc +0.0002921944165474959 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document cc +0.00028358919691127954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document cc +0.0002813182772323272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document cc +0.00027442640800299205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document cc +0.0002747820342933984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document cc +0.0002747584403979717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document cc +0.00027499129634862444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document cc +0.0002712050404257197 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document cc +0.0002616256943143254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document cc +0.00026769938929002815 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document cc +0.00038396081322727017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document cc +0.0003863140490027991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document cc +0.00037702277513203237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document cc +0.0003633274156107032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document cc +0.0003587473889240435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document cc +0.0003507672084278415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document cc +0.00033776425499780385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document cc +0.0003377914127574796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document cc +0.00032948015659161326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document cc +0.00033245638541392985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document cc +0.00031080707640648695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document cc +0.0002976903331149755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document cc +0.0002965121463725523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document cc +0.0002933849695266647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document cc +0.0002837035078508233 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document cc +0.00028684569079589323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document cc +0.0003145192320802359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document cc +0.0003566937253273515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document cc +0.0003470199109592918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document cc +0.0003060245312041868 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document cc +0.0002650817213818789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document cc +0.0002643604938780134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document cc +0.000299350876031416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document cc +0.0003178540797697938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document cc +0.000271850367887767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document cc +0.00031349896596549 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document cc +0.00031749734412765755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document cc +0.0003791137842391209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document cc +0.0003742334169957992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document cc +0.0003705639757351107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document cc +0.0003126986769797042 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document cc +0.00031038132814561196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document cc +0.00036464437173804883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document cc +0.0003569480488951322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document cc +0.0003541239221619106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document cc +0.00035315297411308053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document cc +0.0003572451925404141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document cc +0.0003514986129411253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document cc +0.0003521798298425866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document cc +0.00034553677439244716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document cc +0.000349004719809412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document cc +0.0003468247484872769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document cc +0.0003465822608356558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document cc +0.00035410983132162007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document cc +0.0003487908354969444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document cc +0.0003479024763238147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document cc +0.000341412530646823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document cc +0.00034451316273667034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document cc +0.0002618849993484869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document cc +0.00026788679978901144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document cc +0.00027450670773227214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document cc +0.0002661273129899329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document cc +0.00026836569676402957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document cc +0.00026155876975483236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document cc +0.0002609276830117151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document cc +0.0002644161630512771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document cc +0.00036789208972872557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document cc +0.00037829849439990513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document cc +0.0003788894943523098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document cc +0.0003617207777959397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document cc +0.0002541334487248998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document cc +0.0002707945538071073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document cc +0.00027046282716455214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document cc +0.0002652443167243215 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document cc +0.0002685859923850986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document cc +0.00025734961751176414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document cc +0.000259041720872915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document cc +0.00025340107274823446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document cc +0.00025757135121837893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document cc +0.00025617700500574084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document cc +0.0002566931670562857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document cc +0.0002543871190716101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document cc +0.00024997565589481713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document cc +0.0002954079779456287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document cc +0.00034890741135252835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document cc +0.0003473298137731525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document cc +0.0003296959618486435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document cc +0.0003304520061604598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document cc +0.00032377956175729824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document cc +0.00031700696295168713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document cc +0.0003060382346081943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document cc +0.0003012003005056863 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document cc +0.0002981074073993884 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document cc +0.0002922128825950705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document cc +0.000348901087722931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document cc +0.0003408286289467841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document cc +0.0003410649680770183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document cc +0.0003358524215576502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document cc +0.0003343661874989231 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document cc +0.00032810573699389156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document cc +0.00032261449539097497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document cc +0.0003162694866049203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document cc +0.0003158381156468853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document cc +0.000317376061083603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document cc +0.0003125788639953052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document cc +0.0003010105041885602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document cc +0.0003065865059090678 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document cc +0.0003084275726508053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document cc +0.00030966560718296085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document cc +0.0002957728057853081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document cc +0.00029904164542325336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document cc +0.0002955358888729187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document cc +0.00028692976446931544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document cc +0.0002923476214935797 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document cc +0.0002893691697212419 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document cc +0.0002855895211981585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document cc +0.00027968347097626246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document cc +0.0002810783462604979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document cc +0.00027794080455729715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document cc +0.00034784376461416953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document cc +0.0003488347959010943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document cc +0.00034790583710250724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document cc +0.000345913166618151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document cc +0.00033801936268066675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document cc +0.0003290591130212315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document cc +0.00034051399521366823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document cc +0.00032470943131841784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document cc +0.00031679540050914276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document cc +0.00031814596342422325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document cc +0.0003156466289485036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document cc +0.00029985010879003633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document cc +0.0002905176377776361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document cc +0.0004206836775460856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document cc +0.00020660449162246918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document cc +0.0003461727254468087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document cc +0.00020592870907067763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document cc +0.00034173505299233005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document cc +0.0004052437256652738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document cc +0.0004080650901351697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document cc +0.00039778184149144276 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document cc +0.00039046311464950275 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document cc +0.00039043444911071384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document cc +0.000388575704932843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document cc +0.00019737533145666597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document cc +0.00037610755595812403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document cc +0.00037315400127598317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document cc +0.00037415028580922163 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document cc +0.00036694041707212337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document cc +0.00018947219857306515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document cc +0.00037046050826533545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document cc +0.0003587440768559087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document cc +0.00034623936498708903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document cc +0.0003502289592617922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document cc +0.00034692398063649823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document cc +0.000339340809421849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document cc +0.0003360510394816983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document cc +0.0003354673850814145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document cc +0.00032937682875877047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document cc +0.00032844505049317715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document cc +0.00028287199339908627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document cc +0.0002795217197003578 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document cc +0.00028048955601883463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document cc +0.0002769326396439027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document cc +0.0002727090021299243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document cc +0.0002726577841024554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document cc +0.00026663619593455374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document cc +0.00026068042672138127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document cc +0.0002637704114326801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document cc +0.0002593043567100412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document cc +0.0002599897110113453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document cc +0.0002435078682758859 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document cc +0.0002450530071379054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document cc +0.00024233331983743606 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document cc +0.0002934750947999535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document cc +0.00033241226364044474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document cc +0.00032938406090272075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document cc +0.00032778705403953246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document cc +0.00032184551480398754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document cc +0.00031874002264945737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document cc +0.0003165319685666433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document cc +0.00031307071173376295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document cc +0.00031119524184911957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document cc +0.0003102253344576429 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document cc +0.0003088976240383192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document cc +0.0002951410823077708 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document cc +0.00029772657676757413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document cc +0.0003056048989909935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document cc +0.00031991305381648026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document cc +0.00030890256978362426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document cc +0.0003109382904091933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document cc +0.00031035798529690644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document cc +0.00030741666395911753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document cc +0.0002989918594861846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document cc +0.00029569635443989434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document cc +0.0002973992445667285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document cc +0.000293397351001072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document cc +0.00028737817438047954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document cc +0.00028252738144009747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document cc +0.0002805511898623541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document cc +0.0003718020784620472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document cc +0.0003499713845765235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document cc +0.00034283547445326676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document cc +0.00031464759888838765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document cc +0.00033188946446414833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document cc +0.000326084432195463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document cc +0.0003764568303917893 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document cc +0.0003604955598858414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document cc +0.0003655654554133222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document cc +0.00035762304033750504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document cc +0.00038478883950347103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document cc +0.00027735714341247454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document cc +0.00028139534607773563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document cc +0.00019777292251713763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document cc +0.000285571704874486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document cc +0.00028543482146244363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document cc +0.00019434234484256758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document cc +0.00027854908176986763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document cc +0.0002847068039566143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document cc +0.00028672356943064853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document cc +0.00027782687605808177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document cc +0.0002843539634105203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document cc +0.0002894748379090401 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document cc +0.0002868852440186493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document cc +0.0002818504885373851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document cc +0.00028680112812941034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document cc +0.00019258978168723977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document cc +0.00028760637934715155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document cc +0.0002820439443912918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document cc +0.0002831001054410018 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document cc +0.00029001901552467397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document cc +0.00027779449377883156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document cc +0.00019949837437516796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document cc +0.0002907306472984446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document cc +0.00027814858381318327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document cc +0.00019472790889161432 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document cc +0.00020472626596924125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document cc +0.0002870045081974301 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document cc +0.00019812241927078482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document cc +0.0002817553333369554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document cc 
+0.00027829782796642117 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document cc +0.00028289431732284113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document cc +0.0002795526296717729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document cc +0.00027682829988044574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document cc +0.0002895432402719184 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document cc +0.0002823174903941811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document cc +0.00028170972351837796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document cc +0.00027807915877838826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document cc +0.00028588515681452956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document cc +0.00028112324090816726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document cc +0.00020636178289985485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document cc +0.00019447255290980535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document cc +0.0002850824220591452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document cc +0.00027856429520116784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document cc +0.0002820880676635633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document cc +0.00028943902215995714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document cc +0.0002676366291085329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document cc +0.00023806333809954687 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document cc +0.00024526460430233455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document cc +0.00023876876664622726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document cc +0.00023379770334179805 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document cc +0.00024175151269138382 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document cc +0.00023386583242595706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document cc +0.00023771797150160827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document cc +0.0002262748967483896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document cc +0.0002408148346432682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document cc +0.00023398651720444235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document cc +0.00022989433874474592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document cc +0.00023948500543957772 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document cc +0.0002331594076859196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document cc +0.00023375132439600242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document cc +0.00023923410909668642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document cc +0.00023952796315562954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document cc +0.0002327466076905069 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document cc +0.00023082758956797212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document cc +0.0002240509275524448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document cc +0.00022798879995765268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document cc +0.000221172516774386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document cc +0.00021767045123534623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document cc +0.00021982832794804484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document cc +0.00021971626543789102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document cc +0.00022566565206920132 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document cc +0.0002181984894194856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document cc +0.00021831417549554653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document cc +0.00021601405421187145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document cc +0.00022275733725519607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document cc +0.00021847734911973986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document cc +0.0002243591012664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document cc +0.00021688758139483833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document cc +0.0002182953624789215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document cc +0.00020475155724026002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document cc +0.00021498078062960065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document cc +0.0002157914337233064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document cc +0.00021781838494967963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document cc +0.00021723242266814558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document cc +0.0002176782686553837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document cc +0.0003486179404943968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document cc +0.00034882846352857634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document cc +0.00031400868448352596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document cc +0.00030273484020011963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document cc +0.00029895889118145404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document cc +0.00029770764609621714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document cc +0.0002990181332116852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document cc +0.00029653733972285996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document cc +0.00029624649222942476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document cc +0.00029625609720203576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document cc +0.00029731928930852147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document cc 
+0.00029011721326148513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document cc +0.00028849788197494655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document cc +0.00021601278623858145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document cc +0.00021319599281739178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document cc +0.0002153325290600083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document cc +0.00018566946174516558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document cc +0.00020736824394291617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document cc +0.00020857419820128004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document cc +0.00020058526129536423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document cc +0.00020745812166665217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document cc +0.00020652171015271702 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document cc +0.00020643808911278608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document cc +0.00020040513914482103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document cc +0.00020598050188272898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document cc +0.0001969184139343296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document cc +0.0001972748812937012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document cc +0.0002038556751586195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document cc +0.00020245186011313464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document cc +0.00019950381422038783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document cc +0.00020837055459665258 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document cc +0.00020371856218246096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document cc +0.00019537612301625791 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document cc +0.00019914984508813857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document cc +0.0002053787713691309 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document cc +0.00019082100541008637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document cc +0.00020397153334531813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document cc +0.0002021462693077317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document cc +0.00019609357008124035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document cc +0.00019693256622486236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document cc +0.00020007239732428112 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document cc +0.00020467075741591954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document cc +0.00019584883400022932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document cc +0.00019135050391176972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document cc +0.0003362829834208298 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document cc +0.00034013691154784095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document cc +0.00033215887031941976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document cc +0.00032681189065396707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document cc +0.0003149138485493094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document cc +0.00030179177307540077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document cc +0.0002923278437581119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document cc +0.00029470052278994486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document cc +0.0002994095093045731 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document cc +0.00029033525096085037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document cc +0.00029390798852496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document cc +0.0002916230924130842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document cc +0.00029419886374594913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document cc +0.0002865469756730764 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document cc +0.00021191292549942086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document cc +0.00021369664817409847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document cc +0.00021612485624266726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document cc +0.00022242192634588478 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document cc +0.00014605095659989698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document cc +0.00022070626106341693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document cc +0.0002174420774054071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document cc +0.00021325858963116995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document cc +0.0002124322999488052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document cc +0.0002081218896969054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document cc +0.0002108710211556957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document cc +0.00020686867095978426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document cc +0.00020895752681041895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document cc +0.00020741922266415738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document cc +0.0002069112657197308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document cc +0.00020644627473468118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document cc +0.00020332991338121604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document cc +0.0003560895677789848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document cc +0.00032915779111908214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document cc +0.00033810613317040864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document cc 
+0.00033729626594036923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document cc +0.00033550342864602944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document cc +0.00034173474024556906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document cc +0.000331505340748827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document cc +0.0003270050330117195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document cc +0.00032585275329172556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document cc +0.0003143383203190604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document cc +0.00031655199110388894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document cc +0.00030738872158476413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document cc +0.00030838388352699285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document cc +0.0003053596995351888 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document cc +0.00031836304739584593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document cc +0.000315315435873905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document cc +0.0003087116248965243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document cc +0.00030396790625537645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document cc +0.0003335812246032149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document cc +0.00034570956323095843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document cc +0.00034563035636675786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document cc +0.00033411265479076335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document cc +0.00034439191141692787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document cc +0.0003364483125496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document cc +0.0003299500453608033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document cc +0.00033163377700074837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document cc +0.00032638649660627673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document cc +0.00032616167939645234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document cc +0.0003205289298760723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document cc +0.00031939393740815355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document cc +0.00031593164066731296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document cc +0.00031928871111254405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document cc +0.00029670189073175004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document cc +0.00020517703846735904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document cc +0.00020128418186172073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document cc +0.00019662723895606717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document cc +0.0001981157042081407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document 
cc +0.00019703489037041608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document cc +0.00019079796331785068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document cc +0.0001909352306690079 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document cc +0.00018824662295261396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document cc +0.00019864275319325954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document cc +0.00018818516521649587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document cc +0.00018875694972812844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document cc +0.00018231621170645482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document cc +0.00018349407845798273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document cc +0.00018088971427746906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document cc +0.00018296284236327237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document cc +0.0001876011825819916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document cc +0.000329052068725176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document cc +0.00032223616273648536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document cc +0.00031272564089633955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document cc +0.00031621609908414494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document cc +0.0003117213560911235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document cc +0.00030218064069945934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document cc +0.00030658916600512085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document cc +0.0002915863534115821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document cc +0.0002940280138374372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document cc +0.00029067860468866085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document cc +0.00028529228063135635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document cc +0.00028336893301452256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document cc +0.0002794668089130099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document cc +0.00021681361378827842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document cc +0.0001484664674497246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document cc +0.00021950558378215133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document cc +0.00021806860758808645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document cc +0.00021819568718852282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document cc +0.00021626925931585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document cc +0.0001464536143077762 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document cc +0.00021432777088808917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document cc +0.000213473805865147 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document cc +0.00021397067253964538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document cc +0.00020758957647437263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document cc +0.00020687124337683314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document cc +0.00020630057046511005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document cc +0.0002091166859352538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document cc +0.00020777355025615267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document cc +0.00020709287641496176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document cc +0.00020736464660577094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document cc +0.00020062246741862607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document cc +0.00020693207561942915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document cc +0.00021151004871893024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document cc +0.00019930249098689716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document cc +0.00021589710041231824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document cc +0.00021369204789905741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document cc +0.0002147099923936778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document cc +0.00021077531190389536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document cc +0.0002100509829113836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document cc +0.00021185362601571124 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document cc +0.00020722136637339565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document cc +0.00020300093701169531 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document cc +0.00019859737993313477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document cc +0.00019971314372100164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document cc +0.00019549908270269278 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document cc +0.00019649820843534028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document cc +0.00019619415513498067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document cc +0.00019493006120377898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document cc +0.00019499409035775506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document cc +0.00019252988593634277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document cc +0.00019440768268686405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document cc +0.00018747161324755577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document cc +0.0001879575932372779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document cc +0.00019040707058357506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document cc +0.0001871931095090703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document cc 
+0.00020112966223017096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document cc +0.00020516878165311017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document cc +0.00020664735191740533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document cc +0.00021041398572882962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document cc +0.00020397992929690396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document cc +0.0002039978580295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document cc +0.00020592785601142126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document cc +0.0001990755527445265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document cc +0.00019729564847798732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document cc +0.00019958182230527032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document cc +0.0001985037302636386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document cc +0.00020204130355115716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document cc +0.0002000296401958085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document cc +0.0001983064832295463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document cc +0.00019663108484195617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document cc +0.00019510678560556523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document cc +0.0001873284057063206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document cc +0.00019311553072495885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document cc +0.00034652137288816547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document cc +0.0002813690318850024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document cc +0.00027697649713138685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document cc +0.0002755419092534421 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document cc +0.0002681583054440219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document cc +0.00026945753192750824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document cc +0.00026169470768245737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document cc +0.00026437008960810825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document cc +0.0002637294838228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document cc +0.00026491867965088836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document cc +0.00025504483625138986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document cc +0.0002545040623796586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document cc +0.0002546682814073622 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document cc +0.00025545439487142615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document cc +0.0002626896557978271 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document cc +0.00025092040940402784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document cc 
+0.0002589154885863872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document cc +0.00024106160482721467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document cc +0.0002483289690087987 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document cc +0.0002388930282784437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document cc +0.00024006340759273874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document cc +0.00023765248178029045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document cc +0.00023061351965578936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document cc +0.00024954224883546477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document cc +0.00017861017233018525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document cc +0.00017810832743667658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document cc +0.00017599709170759497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document cc +0.00017462723516505223 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document cc +0.0002906316527068669 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document cc +0.00033762141066247166 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document cc +0.00017170670574152494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document cc +0.00017258674515137717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document cc +0.0002815386173173926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document cc +0.0002996845935618989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document cc +0.0002735268488987296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document cc +0.0002971738713071517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document cc +0.0002942690674002763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document cc +0.0003322222207729567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document cc +0.0003378721656198464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document cc +0.00018307262621851067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document cc +0.00033956081502775057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document cc +0.00031604820927876276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document cc +0.00028805657681088917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document cc +0.00026312293321215633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document cc +0.00034366936722921455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document cc +0.0002865256504406559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document cc +0.0003063615195861786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document cc +0.00028412791619666136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document cc +0.00028060835132727154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document cc +0.00032544974761560506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document 
cc +0.0002647177833217225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document cc +0.0003152621884896575 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document cc +0.0003054625140336913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document cc +0.00031183308312292263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document cc +0.00018175026696621178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document cc +0.00017699918328872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document cc +0.00018222339261441908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document cc +0.00018348005930964137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document cc +0.0001810735993810541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document cc +0.00030846441282038914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document cc +0.0002972326889310354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document cc +0.00017433421318235594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document cc +0.00032799458649525895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document cc +0.00032482130048512673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document cc +0.00031943465668672475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document cc +0.00029615593630484517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document cc +0.0002893126939511001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document cc +0.0002849288351723284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document cc +0.00028383906633569267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document cc +0.00028072526091262615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document cc +0.000284239564292377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document cc +0.0002778903109432523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document cc +0.0002771644389501471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document cc +0.0002733316182319337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document cc +0.00026362539185869363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document cc +0.0002636325383220217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document cc +0.00026740622442302886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document cc +0.0002646771971853427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document cc +0.0002628566720605389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document cc +0.0002644760695434766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document cc +0.0002623837702310999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document cc +0.00026088722976772894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document cc +0.0002567065374799158 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document cc +0.00018857382101207726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document cc 
+0.00019036580399817203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document cc +0.00018348828065261222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document cc +0.00018491851780345073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document cc +0.00018904887260080187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document cc +0.0001875609304251801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document cc +0.00018393034720015817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document cc +0.00018419795526114903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document cc +0.00018699955623404795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document cc +0.00018276256902965128 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document cc +0.00017698045695190812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document cc +0.00018104650132303642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document cc +0.00017758206731279688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document cc +0.00017131402995103497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document cc +0.000175944428350446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document cc +0.0003416745727147391 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document cc +0.0003163259373952889 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document cc +0.0002804489269172448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document cc +0.00028748272397403175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document cc +0.00027603318345630605 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document cc +0.000271638824679648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document cc +0.0002763761210210942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document cc +0.00026501984873172717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document cc +0.00026422486894694714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document cc +0.0002686339100849262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document cc +0.0002610837453940606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document cc +0.000260974343729353 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document cc +0.0002599403837029134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document cc +0.0002937273113238609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document cc +0.0003341790732600504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document cc +0.0002620661576600244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document cc +0.0003027929169239288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document cc +0.00031944039129326894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document cc +0.00019025676304139009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document cc +0.00018680910145009907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document cc 
+0.00034215840419416437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document cc +0.00018618120812119364 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document cc +0.00018605853095599425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document cc +0.00018120712626096538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document cc +0.00018315079292495327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document cc +0.00018362556449041974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document cc +0.0001780024456718171 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document cc +0.00033296526436178697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document cc +0.0001802398632282846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document cc +0.00017340263100798256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document cc +0.00017755840547238697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document cc +0.00018419413735260606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document cc +0.00017869518174591322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document cc +0.00017526271460129484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document cc +0.00017852168597981907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document cc +0.00017566536156787157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document cc +0.00017589867964432936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document cc +0.00017831487394075305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document cc +0.00017837310528935862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document cc +0.00018200908814216548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document cc +0.0001795136627511612 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document cc +0.0003414021775300033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document cc +0.00017177291787788502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document cc +0.0003441900648571877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document cc +0.0003394534597060673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document cc +0.0003236887233114832 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document cc +0.0001639544129688747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document cc +0.00019137443753211255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document cc +0.00018575146284680153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document cc +0.00019184792863440243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document cc +0.00018966043065679055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document cc +0.00017968851317035848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document cc +0.00018479881897661546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document cc +0.0001813642692683015 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document cc +0.0001686449798983066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document cc +0.00018516104592230446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document cc +0.00031283726601066385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document cc +0.0003248607542883853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document cc +0.00031583241601202365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document cc +0.00031238270857730376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document cc +0.000307150592403979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document cc +0.00029443829986847044 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document cc +0.0002942723732234677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document cc +0.00023514930666443422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document cc +0.0020776328951453444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document cc +0.0021768234410538883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document cc +0.002106973549276289 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document cc +0.002110915756171751 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document cc +0.0017032382109816464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document cc +0.0019047944877712286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document cc +0.0019402711744016077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document cc +0.0006264790011223686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document cc +0.0017885401938106643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document cc
+
diff --git a/ALCF/data-lists/polaris/dolma.txt b/ALCF/data-lists/polaris/dolma.txt
new file mode 100644
index 0000000000..f2f98ab12c
--- /dev/null
+++ b/ALCF/data-lists/polaris/dolma.txt
@@ -0,0 +1,2419 @@
+0.0018520780893211373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document algebraic-stack-train +0.0017591050606817512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document algebraic-stack-train +0.001459052794333798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document algebraic-stack-train +0.0007405667281569194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document algebraic-stack-train +0.00019420030110896795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document algebraic-stack-train +0.0009008668715801845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document algebraic-stack-train +0.00015115827957143057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document algebraic-stack-train +0.0014552844319220648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document algebraic-stack-train +0.0012469861325685161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document algebraic-stack-train +0.00136412011372413
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document algebraic-stack-train +0.0007064279699221103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document algebraic-stack-train +0.0008472240000687427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document algebraic-stack-train +0.0001984375713341955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document algebraic-stack-train +0.0005472773881697123 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document algebraic-stack-train +0.001815779629850992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document algebraic-stack-train +0.0018313600689757324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document algebraic-stack-train +0.0002583902668716813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document arxiv +0.0002646575141232155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document arxiv +0.0003165521247456758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document arxiv +0.0002920706460176214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document arxiv +0.00028396813182810215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document arxiv +0.00030445161883108107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document arxiv +0.00031628781276576474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document arxiv +0.0003083776568189157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document arxiv +0.0003176359471472902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document arxiv +0.0002536009369131698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document arxiv +0.0003067491424681363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document arxiv +0.0002597217257557784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document arxiv +0.0003788556450109768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document arxiv +0.0002796563272052598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document arxiv +0.00033573826524290287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document arxiv +0.00030523658022800287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document arxiv +0.00032211552192240096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document arxiv +0.0003329295675164247 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document arxiv +0.0003101982186639862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document arxiv +0.00032361798234223355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document arxiv +0.0003495541581652915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document arxiv +0.0002821637448858042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document arxiv +0.00030399523537629673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document arxiv +0.0002955658968247219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document arxiv +0.00028942158502924254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document arxiv +0.00028769546171490733 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document arxiv +0.0002938111057234182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document arxiv +0.0002711150403010948 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document arxiv +0.00031130095874747565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document arxiv +0.0003002996118160777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document arxiv +0.0003732757901604459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document arxiv +0.00026784205751795894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document arxiv +0.0002799626521661984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document arxiv +0.00034334276069078164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document arxiv +0.0003582469803674965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document arxiv +0.00031094844818418623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document arxiv +0.0002766228384977191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document arxiv +0.00030297116159471485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document arxiv +0.00027033888377464685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document arxiv +0.00030090862368377933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document arxiv +0.00028543875802490955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document arxiv +0.00027559768459074204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document arxiv +0.0003182185533962886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document arxiv +0.0003311392971435837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document arxiv +0.00028751652060804325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document arxiv +0.000303466863212589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document arxiv +0.00033400462801277524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document arxiv +0.0002589234031777426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document arxiv +0.0002913508598466723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document arxiv +0.0002670572450004856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document arxiv +0.00032027399105647656 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document arxiv +0.00032188376258379377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document arxiv +0.0003161585784100882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document arxiv +0.0003184249182974135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document arxiv +0.00030381336664000807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document arxiv +0.0003190437442184283 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document arxiv +0.0002537961798200545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document arxiv +0.0003017817117223326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document arxiv +0.00028685268513240224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document arxiv +0.00031265179094451165 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document arxiv +0.00034708319096986816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document arxiv +0.00026650837943080664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document arxiv +0.00034588832248507335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document arxiv +0.0002416982248399037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document arxiv +0.0003089296918222243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document arxiv +0.00029137184185700827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document arxiv +0.00026464226846800774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document arxiv +0.00030545397919456627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document arxiv +0.0003206778460448875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document arxiv +0.00030968971641110967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document arxiv +0.00023325653928600864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document arxiv +0.00030526899198338555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document arxiv +0.00035376719076633584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document arxiv +0.000290224385981026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document arxiv +0.000294650083382008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document arxiv +0.00028768858128616436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document arxiv +0.00030856965235527843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document arxiv +0.00030579942447879054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document arxiv +0.0002863101084704357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document arxiv +0.0002870032092492213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document arxiv +0.000264182727569885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document arxiv +0.0002974012367036449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document arxiv +0.00032238412143059203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document arxiv +0.00031683716893819036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document arxiv +0.00031157434937617524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document arxiv +0.0003411742735695989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document arxiv +0.00026778444816570715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document arxiv +0.0003037045797275201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document arxiv +0.00027746114370081314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document arxiv +0.00027148285946862043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document arxiv +0.00028042950114678207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document arxiv +0.0003235607816590721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document arxiv +0.0003086692227306295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document arxiv +0.00033990349455148105 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document arxiv +0.00030945053208470265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document arxiv +0.00027309074552265303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document arxiv +0.00028737393506316194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document arxiv +0.0003098868328009879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document arxiv +0.0002614229162588409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document arxiv +0.0002884388407820923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document arxiv +0.0031025147279277244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document books +0.003102019887362634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document books +0.0009996745994661548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document books +0.0002406272620255565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document c4 +0.0002404825539493424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document c4 +0.00024062296575435581 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document c4 +0.00024069315766818953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document c4 +0.00024055829162263452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document c4 +0.00024062053397343032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document c4 +0.0002410715545206964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document c4 +0.00024024881846087368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document c4 +0.0002407074700790688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document c4 +0.00024072141428809043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document c4 +0.00024027710230872736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document c4 +0.0002409111299205489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document c4 +0.00024081954058275009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document c4 +0.00024086076794990912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document c4 +0.00024098672620832446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document c4 +0.00024068622303333862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document c4 +0.00024140627024291824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document c4 +0.0002414512033594384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document c4 +0.00024028742594941463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document c4 +0.00024018036089269645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document c4 +0.0002398347365034979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document c4 +0.00024006780153485276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document c4 +0.00024015620270419213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document c4 +0.0002408848259695227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document c4 +0.0002408023185278831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document c4 +0.00024021196580140326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document c4 +0.00024077677271297493 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document c4 +0.00024087392454668027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document c4 +0.0002408071293824126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document c4 +0.00024042223828845715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document c4 +0.0002411484752360495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document c4 +0.00023605263746465907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document c4 +0.00023471222158326908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document c4 +0.00023432138580287644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document c4 +0.00023407385623382327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document c4 +0.00023487504174367091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document c4 +0.0002341843704976313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document c4 +0.00023421993170282486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document c4 +0.00023445057969132037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document c4 +0.0002337681680073047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document c4 +0.000234627964808109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document c4 +0.0002338942211888584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document c4 +0.00023403849286843386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document c4 +0.00023405641310796305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document c4 +0.00023349169562397965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document c4 +0.00023381157386048856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document c4 +0.00023388742993790587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document c4 +0.00023363103829469813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document c4 +0.00023421141834630477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document c4 +0.00023420564352232565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document c4 +0.00023367463699173143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document c4 +0.00023344969163567033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document c4 +0.00023372196941547188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document c4 +0.00023399207645297834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document c4 +0.00023357915605505856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document c4 +0.00023337585642190864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document c4 +0.00023385005470157914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document c4 +0.00023301533534493465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document c4 +0.00023377864302541782 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document c4 +0.00023323745848621437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document c4 +0.0002330594611151835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document c4 +0.0002334149675026783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document c4 +0.00023198945902291534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document 
c4 +0.00023023784834634142 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document c4 +0.00022985623060187217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document c4 +0.0002292605284569516 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document c4 +0.00022926593333048894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document c4 +0.00022922766406807777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document c4 +0.00022898153911167426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document c4 +0.0002292473111593315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document c4 +0.000228804579400424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document c4 +0.00022865485613513526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document c4 +0.00022937426835887895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document c4 +0.00022917388311587372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document c4 +0.0002291660582019043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document c4 +0.00022907895248360543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document c4 +0.0002294617879920205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document c4 +0.0002290452150516566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document c4 +0.00022943405619715553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document c4 +0.0002296271421006204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document c4 +0.00022854791372910372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document c4 +0.00022923123467686557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document c4 +0.00022852404355738494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document c4 +0.00022847798660086642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document c4 +0.0002289604586810316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document c4 +0.00022835479834950643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document c4 +0.0002289149402884243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document c4 +0.00022806655474763446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document c4 +0.00022826296420992974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document c4 +0.00022906829636213627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document c4 +0.0002287628414466998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document c4 +0.0002282673911253445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document c4 +0.00022869309841939134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document c4 +0.0002281540116815451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document c4 +0.0002259755756162738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document c4 +0.00022562331285233504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document c4 +0.0002259061146106053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document c4 +0.00022567670836663787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document c4 +0.00022573165387587061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document c4 +0.00022508514961670572 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document c4 +0.00022564642513773356 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document c4 +0.00022563088621998788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document c4 +0.0002250438755373707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document c4 +0.00022524465346241134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document c4 +0.00022531737657666812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document c4 +0.00022444687519363458 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document c4 +0.00022460397498596298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document c4 +0.00022454218976501763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document c4 +0.00022447528843671366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document c4 +0.00022501666332178926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document c4 +0.00022453752304377972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document c4 +0.00022484451871163002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document c4 +0.00022465678847154914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document c4 +0.00022453180917044732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document c4 +0.0002247278486823009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document c4 +0.00022465794828242097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document c4 +0.00022431000701925386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document c4 +0.00022476020248460963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document c4 +0.00022467531771795015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document c4 +0.0002236391309945234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document c4 +0.00022458764920536007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document c4 +0.00022430877426744415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document c4 +0.0002247047786127192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document c4 +0.0002245298090400035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document c4 +0.0002245648831396188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document c4 +0.00022292894729820784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document c4 +0.00022236668082957533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document c4 +0.0002217622659895442 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document c4 +0.00022252452726732609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document c4 +0.00022135333211363678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document c4 +0.0002214571757787971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document c4 +0.0002217188139237798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document c4 +0.00022144214894640303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document c4 +0.00022100172806631854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document c4 +0.00022156392409199052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document c4 +0.00022134830143710272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document 
c4 +0.00022158598922529453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document c4 +0.00022142932483041377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document c4 +0.00022120980907786554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document c4 +0.00022117917738112441 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document c4 +0.00022077089397851235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document c4 +0.00022093265074996711 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document c4 +0.00022091299741377004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document c4 +0.0002205849150703338 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document c4 +0.0002210648204787979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document c4 +0.0002214235747364102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document c4 +0.00022083907302221787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document c4 +0.0002206334237915964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document c4 +0.00022065193929912214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document c4 +0.00022079775597767288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document c4 +0.00022091492909963518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document c4 +0.00022095009987097293 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document c4 +0.0002208150577180165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document c4 +0.00022085759102772088 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document c4 +0.00022073789170129016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document c4 +0.00022049322781182384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document c4 +0.00022083270617761285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document c4 +0.00021982452827473632 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document c4 +0.00021899870446514259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document c4 +0.00021890358773356361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document c4 +0.00021875556609042841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document c4 +0.00021861195987201226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document c4 +0.00021856782186167455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document c4 +0.00021912837771543515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document c4 +0.00021900213768517756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document c4 +0.00021871675851390374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document c4 +0.0002180537056545586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document c4 +0.0002188196714327129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document c4 +0.00021851362624523464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document c4 +0.0002183236795498736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document c4 +7.291153618675672e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document c4 +0.0003742481815405742 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document cc +0.00038204855962733055 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document cc +0.00038821818392663593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document cc +0.00038723332988783727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document cc +0.00038916141142149904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document cc +0.00038049542523949033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document cc +0.0003854755539534284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document cc +0.00024202756466512517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document cc +0.0003915405155008087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document cc +0.0003927382151931033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document cc +0.0003839151202260479 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document cc +0.00040006817468967907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document cc +0.00040318965964443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document cc +0.0003831013019452741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document cc +0.00039166638383204036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document cc +0.00039962784023961004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document cc +0.00039536707853602614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document cc +0.0004204304698247758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document cc +0.00041538899178693555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document cc +0.00039186953333675306 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document cc +0.00038945837196504305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document cc +0.0003919951238929062 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document cc +0.00044377065718528966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document cc +0.0004407759068603017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document cc +0.0002487811895843715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document cc +0.00039349432045556636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document cc +0.00041223198559462343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document cc +0.0004036573014830213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document cc +0.0003825982215521807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document cc +0.00040386867133151386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document cc +0.00024460575279105167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document cc +0.000269029789531335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document cc +0.0003573757493252864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document cc +0.0004600876681392076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document cc +0.0002605354166397086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document cc +0.0003882502452157999 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document cc +0.0002466747612126512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document cc +0.0004024726105072402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document cc +0.00040820631128483644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document cc +0.0002691094350403538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document cc +0.00026916830387277267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document cc +0.0004204663297880574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document cc +0.00042379698687085554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document cc +0.0004502169227311871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document cc +0.0002661708937015295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document cc +0.00031239486948031334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document cc +0.0003109054589936201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document cc +0.00045873053079760646 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document cc +0.00022904931423244635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document cc +0.0003813462028433663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document cc +0.00039188129256500874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document cc +0.00045124222276983765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document cc +0.00048138658436853695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document cc +0.0003944178776279866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document cc +0.00039941569676754006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document cc +0.00037952761190240494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document cc +0.0003944870860881476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document cc +0.0003891842411856621 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document cc +0.000387688981934861 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document cc +0.00039197953876258005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document cc +0.00039007915280311206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document cc +0.0003995520363699188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document cc +0.00039230985654592406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document cc +0.0003929472067173851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document cc +0.0003924096172671473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document cc +0.0003881636143629905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document cc +0.000389790617937084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document cc +0.00037351762309221023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document cc +0.0003630196170929407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document cc +0.00033532465765142113 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document cc +0.0003076088685761823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document cc +0.00039463850897720803 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document cc +0.0002843816115231449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document cc +0.0002909175709416474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document cc +0.00028867170997202486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document cc +0.0002838644617723659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document cc +0.00029027869525543416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document cc +0.0002821339567560056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document cc +0.0002922988877045601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document cc +0.0002866955958315786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document cc +0.0002865271754558126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document cc +0.0002861247475618473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document cc +0.0002826681072408606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document cc +0.0002849746458282827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document cc +0.0002816966633435316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document cc +0.00026255342235948463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document cc +0.0002552895098829678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document cc +0.00025990194083107813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document cc +0.0002524062657685835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document cc +0.0002538577379748611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document cc +0.0002561415177406761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document cc +0.00026206253059694905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document cc +0.00026168095406910565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document cc +0.0002601305742008613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document cc +0.00025200823006814814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document cc +0.0003229951981263502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document cc +0.00037289448266476045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document cc +0.0003807825862179898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document cc +0.0003616333738191483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document cc +0.0003665117918907636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document cc +0.0003684186453633228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document cc +0.0003589330610806066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document cc +0.00036383861418030395 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document cc +0.000359841363355303 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document cc +0.00036431044063050464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document cc +0.0003668574090358279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document cc +0.000362768263620199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document cc +0.0003501888032771077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document cc +0.000352401968221528 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document cc +0.0003541019701869794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document cc +0.0003628121865546891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document cc +0.0003752582953758773 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document cc +0.00037902046230424966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document cc +0.0003777927146925147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document cc +0.0003760676130509053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document cc +0.00034046049078755405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document cc +0.0003338847563259091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document cc +0.00033294499102761794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document cc +0.0004912026198265864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document cc +0.00032064363474664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document cc +0.00032154190389541214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document cc +0.00032309660151746207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document cc +0.00031181143365304544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document cc +0.00031046092294569104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document cc +0.00031150165249068046 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document cc +0.0003041314265988224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document cc +0.0003024834909739394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document cc +0.0003019936835833604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document cc +0.000292329665283177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document cc +0.0002867061143144972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document cc +0.00028443615610701707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document cc +0.00028462291013755945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document cc +0.0002793538601205013 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document cc +0.00027306573977044246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document cc +0.00027097155673336525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document cc +0.0002752934202112985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document cc +0.00043042012694697647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document cc +0.00047495648822986177 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document cc +0.00047755032493473855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document cc +0.0004706974343933747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document cc +0.00046682163297771817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document cc +0.0004616765425874178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document cc +0.00030644496751628097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document cc +0.0002909492555358308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document cc +0.00027272036068261724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document cc +0.0004101070217315588 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document cc +0.0003728914338834357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document cc +0.00036546911442305647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document cc +0.0003669945482407483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document cc +0.0003715902407424017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document cc +0.00035837486406683366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document cc +0.0003573318538685469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document cc +0.0003553784893071916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document cc +0.0004920659809912352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document cc +0.0004533619411303183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document cc +0.00045067066057818706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document cc +0.00044396985139270645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document cc +0.00043198288204468477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document cc +0.00043005174223738454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document cc +0.00041847118430776784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document cc +0.00042952036375796664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document cc +0.00043420594647324267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document cc +0.0003461123241053012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document cc +0.0003408581597849182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document cc +0.00033172705422182547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document cc +0.0003392566490686136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document cc +0.00033578341518385483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document cc +0.0003439196710518844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document cc +0.00034559163447085543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document cc +0.00033762478642902825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document cc +0.00033215210055107224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document cc +0.00033423579608014966 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document cc +0.0004963355016025102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document cc +0.0004996862761456923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document cc +0.0005000551829325451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document cc +0.0005004212610098755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document cc +0.00027768695585500585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document cc +0.00028395983854338433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document cc +0.00027835826303062254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document cc +0.0002740073176010804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document cc +0.0002791830529274016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document cc +0.0002796863816194411 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document cc +0.00026697453022672804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document cc +0.0002594197440280141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document cc +0.0003779565697649222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document cc +0.00041835823476586606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document cc +0.00043788493575265915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document cc +0.0002731731970096006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document cc +0.000276305847423402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document cc +0.0002704955773958623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document cc +0.0002629635944827518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document cc +0.000260070956974436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document cc +0.00025661553791456334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document cc +0.00025794727207576157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document cc +0.00025295733980001527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document cc +0.0003788106407021029 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document cc +0.0004882344027669431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document cc +0.0003275324309642705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document cc +0.0004803401856640094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document cc +0.00046720138323433943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document cc +0.00043527810307095335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document cc +0.00043905395741627827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document cc +0.00048774175867331425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document cc +0.00048380704121346737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document cc +0.0004779011848346118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document cc +0.00046255587581908036 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document cc +0.00045127922880511576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document cc +0.0004503891485256095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document cc +0.0004450142332303422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document cc +0.00044630282482516654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document cc +0.00044325014465743616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document cc +0.0004263874842796447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document cc +0.0004217530913646938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document cc +0.000415120314341852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document cc +0.00040987168279144537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document cc +0.00033468337266607834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document cc +0.0003353094464683005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document cc +0.0004833936821707294 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document cc +0.00047194878988920935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document cc +0.0004648324126996427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document cc +0.0004562345003964941 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document cc +0.0004933203505465098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document cc +0.0003530166075325466 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document cc +0.00035368548192804685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document cc +0.0004872620828289663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document cc +0.00048293889392426456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document cc +0.00047936768462267655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document cc +0.00047821013991587545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document cc +0.0004660610308564753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document cc +0.000394683430103437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document cc +0.00039165053441571324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document cc +0.0003906936040164381 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document cc +0.00038074803919159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document cc +0.0003686529291578143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document cc +0.00035832920428870976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document cc +0.00035929024535947033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document cc +0.0003538226556050544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document cc +0.0003584167868708799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document cc +0.0003480507542594234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document cc +0.0003413709023543034 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document cc +0.00034001304759361455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document cc +0.00033430532902756514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document cc +0.00046519252660631277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document cc +0.0002938876402514769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document cc +0.00028676090994509047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document cc +0.00027296150117506716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document cc +0.00026513502621960483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document cc +0.0002680081327926125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document cc +0.00025831225828720344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document cc +0.00026647037295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document cc +0.0002525733734572654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document cc +0.00025831708887575375 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document cc +0.00042487627444443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document cc +0.0004951213245023891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document cc +0.0004804051413177752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document cc +0.0004662397611340532 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document cc +0.0004550138655253933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document cc +0.00044494909122746795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document cc +0.0002899112253051385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document cc +0.0004372879736279761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document cc +0.0004529568099252922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document cc +0.00045127826158829573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document cc +0.0004436558176737439 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document cc +0.0004419233237678378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document cc +0.000434589215880319 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document cc +0.00029153613207706566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document cc +0.0004312458058738854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document cc +0.00028741854968757313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document cc +0.00046853200754421234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document cc +0.0004949145252030074 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document cc +0.00044459683920483167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document cc +0.0003836095306696336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document cc +0.0003789760237872398 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document cc +0.0003749227438304427 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document cc +0.0003628558277173369 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document cc +0.00039468301394041474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document cc +0.00038874701821614864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document cc +0.0004158492456077867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document cc +0.00042360504554060077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document cc +0.00040386729844317623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document cc +0.00027595096702902474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document cc +0.00043638766787829135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document cc +0.0002218691596850179 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document cc +0.0004437566108089954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document cc +0.0003889996411609667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document cc +0.00043454421906537704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document cc +0.0004522564392830988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document cc +0.00041517835659357416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document cc +0.0002614360863446896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document cc +0.00037543522111463596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document cc +0.0004386190133514781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document cc +0.00046358333286115075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document cc +0.00043186261317942404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document cc +0.0002377581602097957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document cc +0.00025973334085074254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document cc +0.00040139099332000796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document cc +0.00043674860686687174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document cc +0.00040853289309329373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document cc +0.000242910191729688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document cc +0.0004431071731750582 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document cc +0.0004388092670482523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document cc +0.000381418866255965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document cc +0.0004100117296419717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document cc +0.00042469230366022745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document cc +0.00041744151905374254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document cc +0.00022835699906752945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document cc +0.0004380161085387397 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document cc +0.00044803212381807456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document cc +0.00040554932796137236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document cc +0.0004234508646347761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document cc +0.00043341209652360653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document cc +0.00023966604734537185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document cc +0.000259165907316014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document cc +0.0004270653021833602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document cc +0.0004341547032162028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document cc +0.0004111478117275994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document cc +0.0004299383567984396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document cc +0.0004241899124590779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document cc +0.0004502719349364145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document cc +0.00038994621469645615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document cc +0.0003859912398894952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document cc +0.0004247535950310557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document cc +0.000386982084327716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document cc +0.0004196451040053251 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document cc +0.0004096278509782259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document cc +0.0004373334932695721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document cc +0.0004180889975240641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document cc +0.00042079636929672745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document cc +0.00038063574611812913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document cc +0.0003817505891515542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document cc +0.0004420096268860222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document cc +0.00039182670726410623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document cc +0.0003635667850372299 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document cc +0.00041564996472055667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document cc +0.000400529358757286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document cc +0.0003939113874958451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document cc +0.00039066622068940996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document cc +0.0004290098538807143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document cc +0.0004240739958197099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document cc +0.00040775392659215333 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document cc +0.0004091634200396925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document cc +0.00042299190476617914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document cc +0.0003701492680344151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document cc +0.0003807353844384635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document cc +0.00038813507771983156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document cc +0.00040072346558408346 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document cc +0.0003603595180423597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document cc +0.00038799421353112465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document cc +0.00037575235582264926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document cc +0.0004239190342959713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document cc +0.0004606044799136546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document cc +0.00045107950652529253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document cc +0.0004391947201871058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document cc +0.0004457516661123035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document cc +0.0004301297170991686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document cc +0.00044661704164586694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document cc +0.0004438849846114837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document cc +0.0004444205734316823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document cc +0.0004190924165303394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document cc +0.00043942581131677875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document cc +0.00021568459798090663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document cc +0.0003814929225407199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document cc +0.0003217453179359235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document cc +0.00031719591470267974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document cc +0.00032434115726922137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document cc +0.0004079911120371051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document cc +0.000329492766381148 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document cc +0.0003845916162001633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document cc +0.0003835208964390098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document cc +0.00037847334157173194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document cc +0.00038296039903791865 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document cc +0.00037896336828472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document cc +0.00037620974396391355 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document cc +0.00037420590727111843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document cc +0.000340490625886403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document cc +0.0003078314411035827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document cc +0.00034153990750656097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document cc +0.0003308858103982067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document cc +0.0003452640607156025 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document cc +0.00033095276418403455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document cc +0.0003116308995860414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document cc +0.00032446713226408477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document cc +0.0003015816821912984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document cc +0.00031612418775706894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document cc +0.0003278516344971041 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document cc +0.00033079446736097217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document cc +0.00032278977146550837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document cc +0.00032065272988207914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document cc +0.0003936696452406576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document cc +0.0003450109536627789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document cc +0.0003339787189919641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document cc +0.0003284303856176974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document cc +0.00033652677276843477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document cc +0.0003257822443845694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document cc +0.0003293985569149334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document cc +0.0003310360260148262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document cc +0.0003233770986418526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document cc +0.0003172280092149422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document cc +0.0003160674744292835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document cc +0.00030931090289598506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document cc +0.0003093173886443107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document cc +0.00033167847081104083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document cc +0.00031131501311729723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document cc +0.00031046608876279845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document cc +0.00030569235942207244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document cc +0.00030777943671285197 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document cc +0.00029303314290956683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document cc +0.0003045824546400205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document cc +0.00030360880677729793 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document cc +0.00031646239964835433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document cc +0.0003129122300603785 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document cc +0.00031060464956661433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document cc +0.000311819032500067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document cc +0.0002977872483902282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document cc +0.0003009448600922438 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document cc +0.00028610292098537774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document cc +0.0002988326876216654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document cc +0.00028550828372819075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document cc +0.0002830381750875739 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document cc +0.0002848495855927156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document cc +0.0002856443760308144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document cc +0.00027442895344188584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document cc +0.0002681160554049462 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document cc +0.0003421482544126989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document cc +0.0004005872948449718 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document cc +0.0003930123959320308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document cc +0.0003867271832275778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document cc +0.000380805140455254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document cc +0.0003814769861947819 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document cc +0.00038025170883282324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document cc +0.0003738026647867475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document cc +0.00018960856915036276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document cc +0.0003697177501953134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document cc +0.00036674194328136693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document cc +0.00036447406838697555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document cc +0.00036686410861101255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document cc +0.00035915267825103423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document cc +0.0003624758404026675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document cc +0.0002822812140180794 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document cc +0.00030620512946920813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document cc +0.000294249776520589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document cc +0.00030238536967523434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document cc +0.00029509593361580754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document cc +0.0002906912701830899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document cc +0.0002921944165474959 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document cc +0.00028358919691127954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document cc +0.0002813182772323272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document cc +0.00027442640800299205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document cc +0.0002747820342933984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document cc +0.0002747584403979717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document cc +0.00027499129634862444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document cc +0.0002712050404257197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document cc +0.0002616256943143254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document cc +0.00026769938929002815 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document cc +0.00038396081322727017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document cc +0.0003863140490027991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document cc +0.00037702277513203237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document cc +0.0003633274156107032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document cc +0.0003587473889240435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document cc +0.0003507672084278415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document cc +0.00033776425499780385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document cc +0.0003377914127574796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document cc +0.00032948015659161326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document cc +0.00033245638541392985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document cc +0.00031080707640648695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document cc +0.0002976903331149755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document cc +0.0002965121463725523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document cc +0.0002933849695266647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document cc +0.0002837035078508233 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document cc +0.00028684569079589323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document cc +0.0003145192320802359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document cc +0.0003566937253273515 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document cc +0.0003470199109592918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document cc +0.0003060245312041868 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document cc +0.0002650817213818789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document cc +0.0002643604938780134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document cc +0.000299350876031416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document cc +0.0003178540797697938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document cc +0.000271850367887767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document cc +0.00031349896596549 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document cc +0.00031749734412765755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document cc +0.0003791137842391209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document cc +0.0003742334169957992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document cc +0.0003705639757351107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document cc +0.0003126986769797042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document cc +0.00031038132814561196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document cc +0.00036464437173804883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document cc +0.0003569480488951322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document cc +0.0003541239221619106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document cc +0.00035315297411308053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document cc +0.0003572451925404141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document cc +0.0003514986129411253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document cc +0.0003521798298425866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document cc +0.00034553677439244716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document cc +0.000349004719809412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document cc +0.0003468247484872769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document cc +0.0003465822608356558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document cc +0.00035410983132162007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document cc +0.0003487908354969444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document cc +0.0003479024763238147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document cc +0.000341412530646823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document cc +0.00034451316273667034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document cc +0.0002618849993484869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document cc +0.00026788679978901144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document cc +0.00027450670773227214 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document cc +0.0002661273129899329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document cc +0.00026836569676402957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document cc +0.00026155876975483236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document cc +0.0002609276830117151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document cc +0.0002644161630512771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document cc +0.00036789208972872557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document cc +0.00037829849439990513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document cc +0.0003788894943523098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document cc +0.0003617207777959397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document cc +0.0002541334487248998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document cc +0.0002707945538071073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document cc +0.00027046282716455214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document cc +0.0002652443167243215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document cc +0.0002685859923850986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document cc +0.00025734961751176414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document cc +0.000259041720872915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document cc +0.00025340107274823446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document cc +0.00025757135121837893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document cc +0.00025617700500574084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document cc +0.0002566931670562857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document cc +0.0002543871190716101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document cc +0.00024997565589481713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document cc +0.0002954079779456287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document cc +0.00034890741135252835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document cc +0.0003473298137731525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document cc +0.0003296959618486435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document cc +0.0003304520061604598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document cc +0.00032377956175729824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document cc +0.00031700696295168713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document cc +0.0003060382346081943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document cc +0.0003012003005056863 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document cc +0.0002981074073993884 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document cc +0.0002922128825950705 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document cc +0.000348901087722931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document cc +0.0003408286289467841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document cc +0.0003410649680770183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document cc +0.0003358524215576502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document cc +0.0003343661874989231 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document cc +0.00032810573699389156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document cc +0.00032261449539097497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document cc +0.0003162694866049203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document cc +0.0003158381156468853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document cc +0.000317376061083603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document cc +0.0003125788639953052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document cc +0.0003010105041885602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document cc +0.0003065865059090678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document cc +0.0003084275726508053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document cc +0.00030966560718296085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document cc +0.0002957728057853081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document cc +0.00029904164542325336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document cc +0.0002955358888729187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document cc +0.00028692976446931544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document cc +0.0002923476214935797 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document cc +0.0002893691697212419 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document cc +0.0002855895211981585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document cc +0.00027968347097626246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document cc +0.0002810783462604979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document cc +0.00027794080455729715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document cc +0.00034784376461416953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document cc +0.0003488347959010943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document cc +0.00034790583710250724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document cc +0.000345913166618151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document cc +0.00033801936268066675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document cc +0.0003290591130212315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document cc +0.00034051399521366823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document cc +0.00032470943131841784 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document cc +0.00031679540050914276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document cc +0.00031814596342422325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document cc +0.0003156466289485036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document cc +0.00029985010879003633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document cc +0.0002905176377776361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document cc +0.0004206836775460856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document cc +0.00020660449162246918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document cc +0.0003461727254468087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document cc +0.00020592870907067763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document cc +0.00034173505299233005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document cc +0.0004052437256652738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document cc +0.0004080650901351697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document cc +0.00039778184149144276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document cc +0.00039046311464950275 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document cc +0.00039043444911071384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document cc +0.000388575704932843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document cc +0.00019737533145666597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document cc +0.00037610755595812403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document cc +0.00037315400127598317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document cc +0.00037415028580922163 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document cc +0.00036694041707212337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document cc +0.00018947219857306515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document cc +0.00037046050826533545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document cc +0.0003587440768559087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document cc +0.00034623936498708903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document cc +0.0003502289592617922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document cc +0.00034692398063649823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document cc +0.000339340809421849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document cc +0.0003360510394816983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document cc +0.0003354673850814145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document cc +0.00032937682875877047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document cc +0.00032844505049317715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document cc +0.00028287199339908627 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document cc +0.0002795217197003578 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document cc +0.00028048955601883463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document cc +0.0002769326396439027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document cc +0.0002727090021299243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document cc +0.0002726577841024554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document cc +0.00026663619593455374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document cc +0.00026068042672138127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document cc +0.0002637704114326801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document cc +0.0002593043567100412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document cc +0.0002599897110113453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document cc +0.0002435078682758859 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document cc +0.0002450530071379054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document cc +0.00024233331983743606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document cc +0.0002934750947999535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document cc +0.00033241226364044474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document cc +0.00032938406090272075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document cc +0.00032778705403953246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document cc +0.00032184551480398754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document cc +0.00031874002264945737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document cc +0.0003165319685666433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document cc +0.00031307071173376295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document cc +0.00031119524184911957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document cc +0.0003102253344576429 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document cc +0.0003088976240383192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document cc +0.0002951410823077708 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document cc +0.00029772657676757413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document cc +0.0003056048989909935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document cc +0.00031991305381648026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document cc +0.00030890256978362426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document cc +0.0003109382904091933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document cc +0.00031035798529690644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document cc +0.00030741666395911753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document cc +0.0002989918594861846 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document cc +0.00029569635443989434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document cc +0.0002973992445667285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document cc +0.000293397351001072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document cc +0.00028737817438047954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document cc +0.00028252738144009747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document cc +0.0002805511898623541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document cc +0.0003718020784620472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document cc +0.0003499713845765235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document cc +0.00034283547445326676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document cc +0.00031464759888838765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document cc +0.00033188946446414833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document cc +0.000326084432195463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document cc +0.0003764568303917893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document cc +0.0003604955598858414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document cc +0.0003655654554133222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document cc +0.00035762304033750504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document cc +0.00038478883950347103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document cc +0.00027735714341247454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document cc +0.00028139534607773563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document cc +0.00019777292251713763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document cc +0.000285571704874486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document cc +0.00028543482146244363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document cc +0.00019434234484256758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document cc +0.00027854908176986763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document cc +0.0002847068039566143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document cc +0.00028672356943064853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document cc +0.00027782687605808177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document cc +0.0002843539634105203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document cc +0.0002894748379090401 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document cc +0.0002868852440186493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document cc +0.0002818504885373851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document cc +0.00028680112812941034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document cc +0.00019258978168723977 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document cc +0.00028760637934715155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document cc +0.0002820439443912918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document cc +0.0002831001054410018 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document cc +0.00029001901552467397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document cc +0.00027779449377883156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document cc +0.00019949837437516796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document cc +0.0002907306472984446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document cc +0.00027814858381318327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document cc +0.00019472790889161432 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document cc +0.00020472626596924125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document cc +0.0002870045081974301 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document cc +0.00019812241927078482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document cc +0.0002817553333369554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document cc +0.00027829782796642117 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document cc +0.00028289431732284113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document cc +0.0002795526296717729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document cc +0.00027682829988044574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document cc +0.0002895432402719184 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document cc +0.0002823174903941811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document cc +0.00028170972351837796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document cc +0.00027807915877838826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document cc +0.00028588515681452956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document cc +0.00028112324090816726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document cc +0.00020636178289985485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document cc +0.00019447255290980535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document cc +0.0002850824220591452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document cc +0.00027856429520116784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document cc +0.0002820880676635633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document cc +0.00028943902215995714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document cc +0.0002676366291085329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document cc +0.00023806333809954687 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document cc +0.00024526460430233455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document cc +0.00023876876664622726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document cc 
+0.00023379770334179805 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document cc +0.00024175151269138382 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document cc +0.00023386583242595706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document cc +0.00023771797150160827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document cc +0.0002262748967483896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document cc +0.0002408148346432682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document cc +0.00023398651720444235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document cc +0.00022989433874474592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document cc +0.00023948500543957772 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document cc +0.0002331594076859196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document cc +0.00023375132439600242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document cc +0.00023923410909668642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document cc +0.00023952796315562954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document cc +0.0002327466076905069 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document cc +0.00023082758956797212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document cc +0.0002240509275524448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document cc +0.00022798879995765268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document cc +0.000221172516774386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document cc +0.00021767045123534623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document cc +0.00021982832794804484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document cc +0.00021971626543789102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document cc +0.00022566565206920132 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document cc +0.0002181984894194856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document cc +0.00021831417549554653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document cc +0.00021601405421187145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document cc +0.00022275733725519607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document cc +0.00021847734911973986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document cc +0.0002243591012664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document cc +0.00021688758139483833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document cc +0.0002182953624789215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document cc +0.00020475155724026002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document cc +0.00021498078062960065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document cc +0.0002157914337233064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document cc +0.00021781838494967963 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document cc +0.00021723242266814558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document cc +0.0002176782686553837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document cc +0.0003486179404943968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document cc +0.00034882846352857634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document cc +0.00031400868448352596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document cc +0.00030273484020011963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document cc +0.00029895889118145404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document cc +0.00029770764609621714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document cc +0.0002990181332116852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document cc +0.00029653733972285996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document cc +0.00029624649222942476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document cc +0.00029625609720203576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document cc +0.00029731928930852147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document cc +0.00029011721326148513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document cc +0.00028849788197494655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document cc +0.00021601278623858145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document cc +0.00021319599281739178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document cc +0.0002153325290600083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document cc +0.00018566946174516558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document cc +0.00020736824394291617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document cc +0.00020857419820128004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document cc +0.00020058526129536423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document cc +0.00020745812166665217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document cc +0.00020652171015271702 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document cc +0.00020643808911278608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document cc +0.00020040513914482103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document cc +0.00020598050188272898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document cc +0.0001969184139343296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document cc +0.0001972748812937012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document cc +0.0002038556751586195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document cc +0.00020245186011313464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document cc +0.00019950381422038783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document cc +0.00020837055459665258 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document cc 
+0.00020371856218246096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document cc +0.00019537612301625791 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document cc +0.00019914984508813857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document cc +0.0002053787713691309 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document cc +0.00019082100541008637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document cc +0.00020397153334531813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document cc +0.0002021462693077317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document cc +0.00019609357008124035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document cc +0.00019693256622486236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document cc +0.00020007239732428112 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document cc +0.00020467075741591954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document cc +0.00019584883400022932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document cc +0.00019135050391176972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document cc +0.0003362829834208298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document cc +0.00034013691154784095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document cc +0.00033215887031941976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document cc +0.00032681189065396707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document cc +0.0003149138485493094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document cc +0.00030179177307540077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document cc +0.0002923278437581119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document cc +0.00029470052278994486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document cc +0.0002994095093045731 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document cc +0.00029033525096085037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document cc +0.00029390798852496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document cc +0.0002916230924130842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document cc +0.00029419886374594913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document cc +0.0002865469756730764 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document cc +0.00021191292549942086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document cc +0.00021369664817409847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document cc +0.00021612485624266726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document cc +0.00022242192634588478 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document cc +0.00014605095659989698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document cc +0.00022070626106341693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document cc +0.0002174420774054071 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document cc +0.00021325858963116995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document cc +0.0002124322999488052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document cc +0.0002081218896969054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document cc +0.0002108710211556957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document cc +0.00020686867095978426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document cc +0.00020895752681041895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document cc +0.00020741922266415738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document cc +0.0002069112657197308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document cc +0.00020644627473468118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document cc +0.00020332991338121604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document cc +0.0003560895677789848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document cc +0.00032915779111908214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document cc +0.00033810613317040864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document cc +0.00033729626594036923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document cc +0.00033550342864602944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document cc +0.00034173474024556906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document cc +0.000331505340748827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document cc +0.0003270050330117195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document cc +0.00032585275329172556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document cc +0.0003143383203190604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document cc +0.00031655199110388894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document cc +0.00030738872158476413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document cc +0.00030838388352699285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document cc +0.0003053596995351888 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document cc +0.00031836304739584593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document cc +0.000315315435873905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document cc +0.0003087116248965243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document cc +0.00030396790625537645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document cc +0.0003335812246032149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document cc +0.00034570956323095843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document cc +0.00034563035636675786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document cc +0.00033411265479076335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document cc +0.00034439191141692787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document cc +0.0003364483125496565 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document cc +0.0003299500453608033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document cc +0.00033163377700074837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document cc +0.00032638649660627673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document cc +0.00032616167939645234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document cc +0.0003205289298760723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document cc +0.00031939393740815355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document cc +0.00031593164066731296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document cc +0.00031928871111254405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document cc +0.00029670189073175004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document cc +0.00020517703846735904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document cc +0.00020128418186172073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document cc +0.00019662723895606717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document cc +0.0001981157042081407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document cc +0.00019703489037041608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document cc +0.00019079796331785068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document cc +0.0001909352306690079 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document cc +0.00018824662295261396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document cc +0.00019864275319325954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document cc +0.00018818516521649587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document cc +0.00018875694972812844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document cc +0.00018231621170645482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document cc +0.00018349407845798273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document cc +0.00018088971427746906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document cc +0.00018296284236327237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document cc +0.0001876011825819916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document cc +0.000329052068725176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document cc +0.00032223616273648536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document cc +0.00031272564089633955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document cc +0.00031621609908414494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document cc +0.0003117213560911235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document cc +0.00030218064069945934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document cc +0.00030658916600512085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document cc +0.0002915863534115821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document cc 
+0.0002940280138374372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document cc +0.00029067860468866085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document cc +0.00028529228063135635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document cc +0.00028336893301452256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document cc +0.0002794668089130099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document cc +0.00021681361378827842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document cc +0.0001484664674497246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document cc +0.00021950558378215133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document cc +0.00021806860758808645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document cc +0.00021819568718852282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document cc +0.00021626925931585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document cc +0.0001464536143077762 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document cc +0.00021432777088808917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document cc +0.000213473805865147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document cc +0.00021397067253964538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document cc +0.00020758957647437263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document cc +0.00020687124337683314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document cc +0.00020630057046511005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document cc +0.0002091166859352538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document cc +0.00020777355025615267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document cc +0.00020709287641496176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document cc +0.00020736464660577094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document cc +0.00020062246741862607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document cc +0.00020693207561942915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document cc +0.00021151004871893024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document cc +0.00019930249098689716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document cc +0.00021589710041231824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document cc +0.00021369204789905741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document cc +0.0002147099923936778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document cc +0.00021077531190389536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document cc +0.0002100509829113836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document cc +0.00021185362601571124 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document cc +0.00020722136637339565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document cc +0.00020300093701169531 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document cc +0.00019859737993313477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document cc +0.00019971314372100164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document cc +0.00019549908270269278 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document cc +0.00019649820843534028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document cc +0.00019619415513498067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document cc +0.00019493006120377898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document cc +0.00019499409035775506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document cc +0.00019252988593634277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document cc +0.00019440768268686405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document cc +0.00018747161324755577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document cc +0.0001879575932372779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document cc +0.00019040707058357506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document cc +0.0001871931095090703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document cc +0.00020112966223017096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document cc +0.00020516878165311017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document cc +0.00020664735191740533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document cc +0.00021041398572882962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document cc +0.00020397992929690396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document cc +0.0002039978580295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document cc +0.00020592785601142126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document cc +0.0001990755527445265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document cc +0.00019729564847798732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document cc +0.00019958182230527032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document cc +0.0001985037302636386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document cc +0.00020204130355115716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document cc +0.0002000296401958085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document cc +0.0001983064832295463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document cc +0.00019663108484195617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document cc +0.00019510678560556523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document cc +0.0001873284057063206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document cc +0.00019311553072495885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document cc +0.00034652137288816547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document cc +0.0002813690318850024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document cc 
+0.00027697649713138685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document cc +0.0002755419092534421 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document cc +0.0002681583054440219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document cc +0.00026945753192750824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document cc +0.00026169470768245737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document cc +0.00026437008960810825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document cc +0.0002637294838228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document cc +0.00026491867965088836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document cc +0.00025504483625138986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document cc +0.0002545040623796586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document cc +0.0002546682814073622 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document cc +0.00025545439487142615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document cc +0.0002626896557978271 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document cc +0.00025092040940402784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document cc +0.0002589154885863872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document cc +0.00024106160482721467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document cc +0.0002483289690087987 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document cc +0.0002388930282784437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document cc +0.00024006340759273874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document cc +0.00023765248178029045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document cc +0.00023061351965578936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document cc +0.00024954224883546477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document cc +0.00017861017233018525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document cc +0.00017810832743667658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document cc +0.00017599709170759497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document cc +0.00017462723516505223 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document cc +0.0002906316527068669 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document cc +0.00033762141066247166 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document cc +0.00017170670574152494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document cc +0.00017258674515137717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document cc +0.0002815386173173926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document cc +0.0002996845935618989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document cc +0.0002735268488987296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document cc +0.0002971738713071517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document cc 
+0.0002942690674002763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document cc +0.0003322222207729567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document cc +0.0003378721656198464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document cc +0.00018307262621851067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document cc +0.00033956081502775057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document cc +0.00031604820927876276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document cc +0.00028805657681088917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document cc +0.00026312293321215633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document cc +0.00034366936722921455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document cc +0.0002865256504406559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document cc +0.0003063615195861786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document cc +0.00028412791619666136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document cc +0.00028060835132727154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document cc +0.00032544974761560506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document cc +0.0002647177833217225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document cc +0.0003152621884896575 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document cc +0.0003054625140336913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document cc +0.00031183308312292263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document cc +0.00018175026696621178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document cc +0.00017699918328872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document cc +0.00018222339261441908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document cc +0.00018348005930964137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document cc +0.0001810735993810541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document cc +0.00030846441282038914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document cc +0.0002972326889310354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document cc +0.00017433421318235594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document cc +0.00032799458649525895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document cc +0.00032482130048512673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document cc +0.00031943465668672475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document cc +0.00029615593630484517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document cc +0.0002893126939511001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document cc +0.0002849288351723284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document cc +0.00028383906633569267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document cc +0.00028072526091262615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document cc 
+0.000284239564292377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document cc +0.0002778903109432523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document cc +0.0002771644389501471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document cc +0.0002733316182319337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document cc +0.00026362539185869363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document cc +0.0002636325383220217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document cc +0.00026740622442302886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document cc +0.0002646771971853427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document cc +0.0002628566720605389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document cc +0.0002644760695434766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document cc +0.0002623837702310999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document cc +0.00026088722976772894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document cc +0.0002567065374799158 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document cc +0.00018857382101207726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document cc +0.00019036580399817203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document cc +0.00018348828065261222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document cc +0.00018491851780345073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document cc +0.00018904887260080187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document cc +0.0001875609304251801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document cc +0.00018393034720015817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document cc +0.00018419795526114903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document cc +0.00018699955623404795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document cc +0.00018276256902965128 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document cc +0.00017698045695190812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document cc +0.00018104650132303642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document cc +0.00017758206731279688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document cc +0.00017131402995103497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document cc +0.000175944428350446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document cc +0.0003416745727147391 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document cc +0.0003163259373952889 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document cc +0.0002804489269172448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document cc +0.00028748272397403175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document cc +0.00027603318345630605 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document cc +0.000271638824679648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document cc 
+0.0002763761210210942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document cc +0.00026501984873172717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document cc +0.00026422486894694714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document cc +0.0002686339100849262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document cc +0.0002610837453940606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document cc +0.000260974343729353 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document cc +0.0002599403837029134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document cc +0.0002937273113238609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document cc +0.0003341790732600504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document cc +0.0002620661576600244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document cc +0.0003027929169239288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document cc +0.00031944039129326894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document cc +0.00019025676304139009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document cc +0.00018680910145009907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document cc +0.00034215840419416437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document cc +0.00018618120812119364 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document cc +0.00018605853095599425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document cc +0.00018120712626096538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document cc +0.00018315079292495327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document cc +0.00018362556449041974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document cc +0.0001780024456718171 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document cc +0.00033296526436178697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document cc +0.0001802398632282846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document cc +0.00017340263100798256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document cc +0.00017755840547238697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document cc +0.00018419413735260606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document cc +0.00017869518174591322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document cc +0.00017526271460129484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document cc +0.00017852168597981907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document cc +0.00017566536156787157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document cc +0.00017589867964432936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document cc +0.00017831487394075305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document cc +0.00017837310528935862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document cc +0.00018200908814216548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document 
cc +0.0001795136627511612 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document cc +0.0003414021775300033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document cc +0.00017177291787788502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document cc +0.0003441900648571877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document cc +0.0003394534597060673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document cc +0.0003236887233114832 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document cc +0.0001639544129688747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document cc +0.00019137443753211255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document cc +0.00018575146284680153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document cc +0.00019184792863440243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document cc +0.00018966043065679055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document cc +0.00017968851317035848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document cc +0.00018479881897661546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document cc +0.0001813642692683015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document cc +0.0001686449798983066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document cc +0.00018516104592230446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document cc +0.00031283726601066385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document cc +0.0003248607542883853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document cc +0.00031583241601202365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document cc +0.00031238270857730376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document cc +0.000307150592403979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document cc +0.00029443829986847044 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document cc +0.0002942723732234677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document cc +0.00023514930666443422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document cc +0.0020776328951453444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document cc +0.0021768234410538883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document cc +0.002106973549276289 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document cc +0.002110915756171751 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document cc +0.0017032382109816464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document cc +0.0019047944877712286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document cc +0.0019402711744016077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document cc +0.0006264790011223686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document cc +0.0017885401938106643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document cc +0.0003547982093445404 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document falcon +0.00035934014428504944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document falcon +0.00035707704501371544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document falcon +0.00035287930712815354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document falcon +0.00035977166728996823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document falcon +0.0003581675664109838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document falcon +0.0003548617059697185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document falcon +0.0003639582000286208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document falcon +0.00035375839698688127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document falcon +0.0003743722020080678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document falcon +0.0003530399715341242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document falcon +0.00035511875882752406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document falcon +0.0003618733574783154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document falcon +0.00035185243285420104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document falcon +0.0003541503739732106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document falcon +0.0003631679485751914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document falcon +0.00035748045578182274 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document falcon +0.0003606490690555877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document falcon +0.0003626383296610091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document falcon +0.00035442644361264756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document falcon +0.00035978370170539796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document falcon +0.0003585562375341541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document falcon +0.0003601958372888019 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document falcon +0.000350277765402227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document falcon +0.0003616521184211704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document falcon +0.0003620625543608188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document falcon +0.0003560781983850704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document falcon +0.0003553209610592676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document falcon +0.00035905348643915075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document falcon +0.00034744258805696526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document falcon +0.00035462784035661496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document falcon +0.00034768186175100895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document falcon +0.0003568534635532736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document falcon +0.00035586511544371234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document falcon +0.0003524567827568137 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document falcon +0.0003512453770426313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document falcon +0.0003591792726468799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document falcon +0.0003514024529343127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document falcon +0.0003584880112586934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document falcon +0.00035133552916418045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document falcon +0.0003600811981350215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document falcon +0.0003571663974228119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document falcon +0.00035768103378874214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document falcon +0.00035939205561113694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document falcon +0.00035186773916029825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document falcon +0.0003542829672490847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document falcon +0.0003592783642898726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document falcon +0.0003556367340099302 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document falcon +0.00035391392271377027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document falcon +0.00035486725707484836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document falcon +0.00034866743396828035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document falcon +0.0003517219808644735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document falcon +0.00034874458549673823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document falcon +0.000355773136961014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document falcon +0.00035611750387841917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document falcon +0.00035305602013916315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document falcon +0.0003578207127071924 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document falcon +0.00035514635841943707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document falcon +0.00034816946212866206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document falcon +0.0003512707269761496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document falcon +0.0003483392117980654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document falcon +0.0003572169607204321 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document falcon +0.00035139153281660794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document falcon +0.00035536422129036537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document falcon +0.000352017164107143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document falcon +0.000351889550179365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document falcon +0.000358759689953589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document falcon +0.0003569286079869268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document falcon +0.0003657752958602099 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document falcon +0.00035396127934790697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document falcon +0.0003618565071224743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document falcon +0.00035146051531973204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document falcon +0.00036107135765783567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document falcon +0.00035019554279994576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document falcon +0.00035567858879904983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document falcon +0.0003504753174793183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document falcon +0.00035931140831329194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document falcon +0.0003502967866002823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document falcon +0.0003532911801041972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document falcon +0.0003583543013070199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document falcon +0.0003566243489931224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document falcon +0.0003468752314799221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document falcon +0.0003597840618138091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document falcon +0.00035128822484768084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document falcon +0.00035889496943437507 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document falcon +0.000352400524650424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document falcon +0.0003518689536768735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document falcon +0.00035866864741303467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document falcon +0.0003454687659106334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document falcon +0.00035348007259317576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document falcon +0.0003539752270940644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document falcon +0.00035146495994081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document falcon +0.00035397212846310423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document falcon +0.00035208246467162587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document falcon +0.0003490843168676626 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document falcon +0.00035299633658644394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document falcon +0.00034868327466167065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document falcon +0.00035941351365601583 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document falcon +0.0003545343062735255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document falcon +0.0003528956380445978 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document falcon +0.0003553355770443352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document falcon +0.0003644224004937743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document falcon +0.00035234291036216907 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document falcon +0.0003596237469847771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document falcon +0.0003531996065735989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document falcon +0.0003547177054106099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document falcon +0.0003575586499260483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document falcon +0.00035262635135283667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document falcon +0.0003624191962188944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document falcon +0.0003488398052948616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document falcon +0.0003598294093147917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document falcon +0.00035583006534466323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document falcon +0.00035403139653225103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document falcon +0.00036134702642187156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document falcon +0.0003573689927162834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document falcon +0.0003577141131435527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document falcon +0.00035208814419277406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document falcon +0.00035996720683665625 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document falcon +0.00035415304658912596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document falcon +0.00036353353029443546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document falcon +0.0003537326003150983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document falcon +0.00036053976358299083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document falcon +0.000352380489373494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document falcon +0.00036154661616900994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document falcon +0.00035959332325963614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document falcon +0.0003597954667189692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document falcon +0.0003563108270597542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document falcon +0.0003582891940460143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document falcon +0.0003497728210484297 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document falcon +0.0003549834902179354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document falcon +0.0003529828233484542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document falcon +0.00034627483903285777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document falcon +0.00035569006572589215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document falcon +0.00035449377946910314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document falcon +0.00035802844396194623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document falcon +0.0003617277809353208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document falcon +0.00035034118898654814 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document falcon +0.000351091193908611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document falcon +0.0003527914342210668 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document falcon +0.00035028288369781376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document falcon +0.00035775745592780506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document falcon +0.0003449630690661468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document falcon +0.0003583490698830361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document falcon +0.0003476995746684122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document falcon +0.0003535632505019212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document falcon +0.00035640180641147417 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document falcon +0.000361731045691765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document falcon +0.0003534082129597368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document falcon +0.0003550344149828664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document falcon +0.00035363002411364057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document falcon +0.0003537265579677396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document falcon +0.00034950531383577937 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document falcon +0.00035008511827347514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document falcon +0.00035594533400871325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document falcon +0.00035266312861335946 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document falcon +0.00035280268794863923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document falcon +0.0003565470391528536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document falcon +0.0003588492322689137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document falcon +0.00035469909697832775 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document falcon +0.00034712082813410526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document falcon +0.000348701157101807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document falcon +0.0003500192014479944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document falcon +0.00035120560544669755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document falcon +0.00035403656850437445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document falcon +0.00035852376560749366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document falcon +0.0003534754068111774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document falcon +0.00035591740046720765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document falcon +0.000348522354782563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document falcon +0.0003533533959664415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document falcon +0.00035631425964030697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document falcon +0.0003485886551574741 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document falcon +0.00035917652631065777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document falcon +0.0003482975272111288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document falcon +0.00035580661277480167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document falcon +0.0003492290722955348 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document falcon +0.00034989284450240613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document falcon +0.0003545677216162781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document falcon +0.00034622286859463484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document falcon +0.00036070626989861965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document falcon +0.00035518365036320786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document falcon +0.00035272907057848406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document falcon +0.0003547343638218734 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document falcon +0.0003496450144966242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document falcon +0.0003537407829294287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document falcon +0.0003489722653985685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document falcon +0.00035057186899911295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document falcon +0.0003507566548933051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document falcon +0.00035630360179023747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document falcon +0.00035631362503416367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document falcon +0.0003490204248026821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document falcon +0.00035761724058371226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document falcon +0.00035037664777467137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document falcon +0.000353402110481068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document falcon +0.00034524163568371745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document falcon +0.00035528523728570974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document falcon +0.00034784916132431703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document falcon +0.00034928476408048925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document falcon +0.00034989205973784984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document falcon +0.00034201664404094254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document falcon +0.0003529676016338611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document falcon +0.00034643433682346637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document falcon +0.0003511666373001904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document falcon +0.00034828669066575333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document falcon +0.0003494625207264413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document falcon +0.0003458957535879216 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document falcon +0.0003543020478990003 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document falcon +0.00034754384069014956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document falcon +0.0003598856392240133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document falcon +0.0003503335458553846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document falcon +0.00035919595619778716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document falcon +0.00035767737970754404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document falcon +0.00035197152783998165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document falcon +0.0003549609834422404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document falcon +0.0003568184100569753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document falcon +0.0003512652818651935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document falcon +0.00035912648958665754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document falcon +0.00034764526964056546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document falcon +0.000352439784960359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document falcon +0.00035295886560764226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document falcon +0.0003518132693658672 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document falcon +0.00035589987915465713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document falcon +0.00034923863317385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document falcon +0.0003457987267929692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document falcon +0.0003560928663480501 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document falcon +0.0003529603811204932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document falcon +0.0003524438555443043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document falcon +0.0003438847030263783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document falcon +0.00035981978898461613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document falcon +0.0003446342778566972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document falcon +0.00035529584995236537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document falcon +0.00034855740895831116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document falcon +0.00034932634912802544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document falcon +0.00035805518303064666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document falcon +0.0003497941877073061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document falcon +0.00035774398685405447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document falcon +0.0003560421780316607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document falcon +0.0003508844468369392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document falcon +0.00035731928892270107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document falcon +0.0003557884626314314 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document falcon +0.00034992996760289355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document falcon +0.000360752554360921 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document falcon +0.0003452321668708545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document falcon +0.0003591745226131023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document falcon +0.00035256981433229084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document falcon +0.00035378123159712034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document falcon +0.000350464354895999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document falcon +0.00035074625557389677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document falcon +0.00035025894701994667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document falcon +0.00035437902514857614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document falcon +0.0003514684519732232 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document falcon +0.00035449717909633905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document falcon +0.0003436816402714221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document falcon +0.00035139158071782116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document falcon +0.0003509424079843335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document falcon +0.000343894618577506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document falcon +0.0003500789770661659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document falcon +0.0003407788080680086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document falcon +0.0003581908175239701 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document falcon +0.0003465541618780918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document falcon +0.00034600228792437736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document falcon +0.00034416738982773204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document falcon +0.0003519900340150641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document falcon +0.000343369616864659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document falcon +0.0003544993883274688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document falcon +0.0003504441365073392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document falcon +0.00034859160702727056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document falcon +0.00035355909532647185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document falcon +0.0003471900922691849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document falcon +0.0003563015508709187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document falcon +0.0003487888744148821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document falcon +0.00034711767548688336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document falcon +0.0003530734609369085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document falcon +0.00035123969242560935 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document falcon +0.0003517127620891489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document falcon +0.00035232835416868673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document falcon +0.0003524437481912308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document falcon +0.0003525996167005602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document falcon +0.00035064770545242043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document falcon +0.00035311558274981226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document falcon +0.00034952204800569914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document falcon +0.0003541471367344846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document falcon +0.00035418812454561825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document falcon +0.0003528951372900714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document falcon +0.0003542338042975688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document falcon +0.00034937738939942796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document falcon +0.0003522182190878447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document falcon +0.0003501406466507449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document falcon +0.00034973079877492633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document falcon +0.0003485274567713538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document falcon +0.00034999308679368985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document falcon +0.0003570051724707296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document falcon +0.00034567230462019706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document falcon +0.00035529000940160696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document falcon +0.00034956512308671755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document falcon +0.0003496962834028953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document falcon +0.0003468745282493457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document falcon +0.0003502717155809202 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document falcon +0.0003556240880896514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document falcon +0.0003515109488424343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document falcon +0.0003563156688192592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document falcon +0.00035040277363989817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document falcon +0.0003481408593290717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document falcon +0.0003624575124332874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document falcon +0.0003522684124250313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document falcon +0.00035286996027653544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document falcon +0.00034967623997256725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document falcon +0.00035182649587602765 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document falcon +0.0003524892557026489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document falcon +0.0003507642477451811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document falcon +0.00036190408389835666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document falcon +0.00035102739424880766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document falcon +0.00035239718753257265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document falcon +0.00035298076121821316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document falcon +0.0003478704389752654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document falcon +0.0003503109191567942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document falcon +0.00035143250975654426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document falcon +0.0003480663923069012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document falcon +0.00035691540219998623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document falcon +0.000348815437166351 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document falcon +0.00035202073257766225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document falcon +0.0003491569096274706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document falcon +0.00035277390475511834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document falcon +0.0003524972090026609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document falcon +0.0003504854249750236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document falcon +0.00034740238025423914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document falcon +0.00034968015462277606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document falcon +0.0003493798632762674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document falcon +0.0003488202537862122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document falcon +0.0003525461864643725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document falcon +0.00034903815232825664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document falcon +0.00035536982539258216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document falcon +0.00034858083265155483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document falcon +0.0003505014973608067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document falcon +0.00035327984042622104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document falcon +0.0003503286677453136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document falcon +0.00035835274842442816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document falcon +0.00034970302660275595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document falcon +0.000357929573140149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document falcon +0.0003517238649788585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document falcon +0.00036097027318848475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document falcon +0.0003502734074110026 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document falcon +0.00035801510806036273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document falcon +0.0003568006373479869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document falcon +0.00036128108717454636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document falcon +0.0003563436883111686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document falcon +0.00035559725321852463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document falcon +0.00035089656006854944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document falcon +0.000359453964362057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document falcon +0.00035629498059104033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document falcon +0.0003622207707090437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document falcon +0.0003540946784512821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document falcon +0.0003594750565232011 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document falcon +0.0003566007415086991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document falcon +0.0003562142599126134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document falcon +0.0003569948186744601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document falcon +0.00035166554847920186 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document falcon +0.00035047994419295137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document falcon +0.0003561578193739437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document falcon +0.00035470866838811544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document falcon +0.00034216920464876335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document falcon +0.0003550021513075795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document falcon +0.0003488045105938729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document falcon +0.0003513340720840151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document falcon +0.0003448558566387584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document falcon +0.0003460966026953241 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document falcon +0.0003488157616036459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document falcon +0.0003446120387842362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document falcon +0.000351528602987427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document falcon +0.00035661118227454713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document falcon +0.0003551342699877457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document falcon +0.0003478953397924445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document falcon +0.00034625782458988215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document falcon +0.0003527515447405871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document falcon +0.00034823744889805696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document falcon +0.00034823314560254406 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document falcon +0.00035162668292961944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document falcon +0.0003477307716074623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document falcon +0.0003446457989477787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document falcon +0.00034782916273767795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document falcon +0.0003517249130302248 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document falcon +0.0003449873430908556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document falcon +0.00034841291749669877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document falcon +0.0003466028498941749 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document falcon +0.0003486436831199424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document falcon +0.0003478279234211838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document falcon +0.0003495903653274374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document falcon +0.00034896893881218957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document falcon +0.000348941645312426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document falcon +0.0003474221308416894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document falcon +0.0003462621543839385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document falcon +0.0003669373860863891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document falcon +0.00034691156268163006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document falcon +0.0003527774103765281 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document falcon +0.00034684565672734663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document falcon +0.0003454250599604457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document falcon +0.0003541536557159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document falcon +0.000345735737037366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document falcon +0.0003524669816385214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document falcon +0.0003441817133096468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document falcon +0.0003519093265859089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document falcon +0.00035080085480352095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document falcon +0.00035285227929327434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document falcon +0.00034354836346901676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document falcon +0.00034789770937373467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document falcon +0.000343665920520102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document falcon +0.0003490884931060568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document falcon +0.00034380029463398654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document falcon +0.00034874768005099945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document falcon +0.0003457058510967673 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document falcon +0.00034644265227023904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document falcon +0.00035008339858594957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document falcon +0.0003462377193296194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document falcon +0.0003620491787114201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document falcon +0.000348717011044469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document falcon +0.00034370072363913706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document falcon +0.0003551981066775649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document falcon +0.0003500119496799342 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document falcon +0.0003485082952669081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document falcon +0.0003508155580978919 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document falcon +0.00035311375163251416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document falcon +0.00034945972003423253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document falcon +0.0003474220353789879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document falcon +0.0003536443686585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document falcon +0.0003560350489042953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document falcon +0.0003493655927914396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document falcon +0.0003528423977146383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document falcon +0.00035255554724471217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document falcon +0.0003479760010190111 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document falcon +0.00035458598862501956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document falcon +0.0003458990560538315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document falcon +0.00035157946422379875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document falcon +0.00034736860650169996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document falcon +0.0003529152313394119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document falcon +0.00034586294329524465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document falcon +0.00035707214923794877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document falcon +0.0003509580363496512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document falcon +0.00035244176725524474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document falcon +0.0003467539557999047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document falcon +0.00034919687962275546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document falcon +0.00035094031731719953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document falcon +0.0003484309008351352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document falcon +0.0003485409424916253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document falcon +0.0003499590776117838 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document falcon +0.0003492842758957848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document falcon +0.0003529712275178912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document falcon +0.0003566141287087449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document falcon +0.0003649496522047409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document falcon +0.0003563218912208234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document falcon +0.00035614782126966145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document falcon +0.0003531944298453266 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document falcon +0.0003535950949566616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document falcon +0.0003544295554928795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document falcon +0.0003519908503740376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document falcon +0.00035752817626134463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document falcon +0.0003515322689589972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document falcon +0.0003486893890307115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document falcon +0.0003446520464889867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document falcon +0.0003509421562481707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document falcon +0.00035335015702909084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document falcon +0.0003490178167345008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document falcon +0.0003520497821155174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document falcon +0.0003549762618908944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document falcon +0.00035072190850833103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document falcon +0.0003542458638526423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document falcon +0.000352419194572916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document falcon +0.0003545102564672614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document falcon +0.0003495437992331806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document falcon +0.0003542843376993964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document falcon +0.000352827529313958 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document falcon +0.00035442506093223886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document falcon +0.0003496970719044257 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document falcon +0.0003553096424442362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document falcon +0.00034986845565067564 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document falcon +0.000352131055186658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document falcon +0.0003527021708198983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document falcon +0.00034905885414547214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document falcon +0.0003583433842468394 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document falcon +0.00034409435202828383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document falcon +0.00034846410520871483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document falcon +0.0003554459991927314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document falcon +0.00035310507471843076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document falcon +0.000350028910786098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document falcon +0.00035049727458009896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document falcon +0.0003519047735925826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document falcon +0.0003513027429919726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document falcon +0.0003626947260354396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document falcon +0.0003500087324849783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document falcon +0.0003618315726725285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document falcon +0.0003535385113938023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document falcon +0.0003487064058517615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document falcon +0.0003618709124780938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document falcon +0.00035040070335625915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document falcon +0.0003506279032267829 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document falcon +0.0003498435310527524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document falcon +0.0003554634749821431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document falcon +0.00035091209738758963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document falcon +0.00035034103678978573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document falcon +0.00035398931854386146 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document falcon +0.00035495529304989485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document falcon +0.00036067883473356603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document falcon +6.322825248625475e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document megawika +2.4432314037946264e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document megawika +5.6313888721313454e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document megawika +2.4208171781595055e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document megawika +2.325811856369237e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document megawika +2.4010790356322705e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document megawika +5.36773610843632e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document megawika +1.360574433501002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document megawika +1.3076540344853244e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document megawika +1.3386534334886313e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document megawika +1.2498103719605153e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document megawika +1.403763836949682e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document megawika +1.3636756723495417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document megawika +1.2242489446940814e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document megawika +1.2398255818973339e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document megawika +1.2972616994216281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document megawika +1.3947809855914134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document megawika +1.3144843787829514e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document megawika +1.1693809976572487e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document megawika +1.3677252682893802e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document megawika +1.3940876719849597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document megawika +1.4222245138730965e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document megawika +1.3201677767919704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document megawika +1.1421717796486169e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document megawika +1.2890514724498703e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document megawika +1.3649507648749037e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document megawika +1.2400732563490717e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document megawika +1.1557681453277616e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document megawika +1.2294483595964517e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document megawika +1.2137484472122283e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document megawika +1.3299663426456e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document megawika +1.2461984216479532e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document megawika +1.4666434217609636e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document megawika +1.1876997894686238e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document megawika +1.2939155338964078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document megawika +1.3859590039728515e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document megawika +1.317917848615668e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document megawika +1.1335281536110342e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document megawika +1.2889923952861426e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document megawika +1.3471671647053326e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document megawika +1.2221720014475102e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document megawika +1.2632647276287541e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document megawika +1.28276219004076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document megawika +1.36213704321643e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document megawika +1.2414858625261553e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document megawika +1.3173700421883744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document megawika +1.295597796725686e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document megawika +1.242783936442904e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document megawika +1.2417374088427464e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document megawika +1.2134479405400744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document megawika +1.3090040663304255e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document megawika +1.2713470581614905e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document megawika +5.5750231378906594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document megawika +5.777597358425469e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document megawika +5.349786767471258e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document megawika +5.675165050453583e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document megawika +5.482611216158831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document megawika +5.065421899890121e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document megawika +5.384718357480146e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document megawika +4.872037363236061e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document megawika +4.532709250783155e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document megawika +5.7257963030489613e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document megawika +4.9014365579652036e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document megawika +5.722863552770969e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document megawika +6.149911636146833e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document megawika +5.2178057608273506e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document megawika +4.990228161160431e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document megawika +5.866186875255134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document megawika +5.004185734360719e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document megawika +4.79401853705107e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document megawika +5.435219965052376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document megawika +5.035997225792266e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document megawika +5.622401774211625e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document megawika +5.028826157387559e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document megawika 
+5.596379470128795e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document megawika +6.027824493191489e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document megawika +5.5358270009931474e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document megawika +5.9839051807685496e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document megawika +5.1221077499249595e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document megawika +5.517228560620279e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document megawika +5.1687858285052305e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document megawika +5.684188244145645e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document megawika +5.212693275535878e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document megawika +4.8551007022784084e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document megawika +5.4888506639203145e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document megawika +5.345098688527242e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document megawika +4.8506420625516594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document megawika +5.132168603397676e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document megawika +5.719476795114223e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document megawika +5.7448621149792696e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document megawika +4.9068410568059265e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document megawika +5.382937299647678e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document megawika +4.8288432136304634e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document megawika +5.841703200305416e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document megawika +5.1589611587885584e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document megawika +6.031113829732574e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document megawika +5.4558202844532094e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document megawika +5.341852317196142e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document megawika +5.1402942738369954e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document megawika +5.735421384377395e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document megawika +5.473629863586958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document megawika +5.4708993245733936e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document megawika +4.931161863634078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document megawika +5.104173022127248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document megawika +5.510157161510824e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document megawika +5.652501401782597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document megawika +5.7273656573031666e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document megawika +5.638363224821738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document megawika +5.6128115396668704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document megawika +5.00304877998141e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document megawika +5.596120554779096e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document megawika +5.5280923889040006e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document megawika +5.223477917938408e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document megawika +5.29472809986569e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document megawika +2.205682378243213e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document megawika +1.4367563720603185e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document megawika +3.5506193487931076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document megawika +3.0442910855821778e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document megawika +2.2540042508019627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document megawika +2.6880163202623216e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document megawika +2.534473148048727e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document megawika +2.6560945431318916e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document megawika +2.547470248967691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document megawika +2.5248825388073738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document megawika +2.5828729575000054e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document megawika +2.4026583817957736e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document megawika +2.3930425429834413e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document megawika +2.5037365362599724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document megawika +2.6696745470595603e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document megawika +2.140323051341762e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document megawika +2.617354786691592e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document megawika +1.538359101762691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document megawika +1.2871029252377856e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document megawika +2.255195411289217e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document megawika +2.4832313897952067e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document megawika +9.303873918189968e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document megawika +2.179532302620228e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document megawika +1.9750517506901206e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document megawika +2.7740420380648435e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document 
megawika +2.7813714782319335e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document megawika +4.1595357937609806e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document megawika +2.741365122389175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document megawika +2.117451071361901e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document megawika +1.7132649760565998e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document megawika +1.7492547092602047e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document megawika +1.7499951097392276e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document megawika +1.6632444789170958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document megawika +1.6678802252361607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document megawika +1.5519208704558896e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document megawika +1.652420992967167e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document megawika +1.6119931034508755e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document megawika +1.6638882076736552e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document megawika +1.7198076782652946e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document megawika +1.572927860565175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document megawika +1.5194822618169918e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document megawika +1.6677776832669846e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document megawika +1.595612492245688e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document megawika +1.682350633181197e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document megawika +1.663983380609724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document megawika +1.710187842689243e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document megawika +1.5733697527539038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document megawika +1.6972104757911438e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document megawika +1.6610142847616577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document megawika +1.61094882403031e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document megawika +1.4789207305138325e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document megawika +1.639299617676302e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document megawika +1.3241204512116132e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document megawika +8.582260726625535e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document megawika +8.213000975576739e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document megawika +9.549247732811947e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document megawika +9.17242785339013e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document megawika +7.632868223725218e-06 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document megawika +8.674401118222175e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document megawika +9.124384255505347e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document megawika +8.344222222417358e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document megawika +8.992299957499065e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document megawika +8.76689497361025e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document megawika +7.973396239586015e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document megawika +9.006935606644125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document megawika +8.725545954955498e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document megawika +1.215449694669174e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document megawika +3.3041720284158646e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document megawika +2.0593512412624502e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document megawika +1.893608946986248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document megawika +1.737111666788535e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document megawika +1.4915923449873955e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document megawika +2.289370239067605e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document megawika +2.8615335689614638e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document megawika +8.847283630883125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document megawika +1.8175470362373804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document megawika +1.8152226683368038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document megawika +1.789149655314284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document megawika +1.7690523036477663e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document megawika +1.8333732213753644e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document megawika +1.8794105687718654e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document megawika +1.721841156706417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document megawika +2.0612008685724796e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document megawika +1.9297370681336376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document megawika +2.0188440409661018e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document megawika +5.1741216329695265e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document megawika +1.3417913926038429e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document megawika +1.1010813016469651e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document megawika +1.1252416134320087e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document megawika +1.2801744104313002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document 
megawika +1.3041514955795817e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document megawika +1.3428837580879075e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document megawika +1.320809382267804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document megawika +1.3451566676555968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document megawika +1.228284926657501e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document megawika +1.2410599573923043e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document megawika +1.3815343367377182e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document megawika +1.3895126265148832e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document megawika +1.2306773644401741e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document megawika +1.32981021906281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document megawika +1.101337469221607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document megawika +1.513094184404692e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document megawika +1.1073759547073234e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document megawika +1.2879348765857567e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document megawika +9.619595770228435e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document megawika +1.2384340836286436e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document megawika +1.1766667232211577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document megawika +1.2871049236196452e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document megawika +1.2010645926497744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document megawika +1.3971428231518597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document megawika +1.2283733550547932e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document megawika +1.2659530508255308e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document megawika +1.551775613074462e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document megawika +1.1169413343776979e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document megawika +1.1433700593712463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document megawika +4.964773647323492e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document megawika +1.0995586595687313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document megawika +1.2957393071411267e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document megawika +2.75899247407709e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document megawika +2.8269344597344854e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document megawika +2.329108187246831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document megawika +2.4231761430460284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document megawika +1.2434140512230442e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document megawika +1.638718338352859e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document megawika +3.272953556801187e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document megawika +6.061314500486327e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document megawika +1.2465979731210292e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document megawika +1.2737557327967737e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document megawika +1.038428658075627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document megawika +2.61666472045566e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document megawika +3.6506873212272224e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document megawika +1.5066359138295701e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document megawika +1.1166290872121178e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document megawika +1.5546966228590285e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document megawika +1.2583434625014828e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document megawika +1.3398826881300862e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document megawika +1.2944933160515968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document megawika +1.0971437399901365e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document megawika +1.2787922795775774e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document megawika +1.404979227816985e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document megawika +1.3344734431324463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document megawika +4.886031157107555e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document megawika +3.277261443596394e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document megawika +3.5057957685786495e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document megawika +3.287625301718589e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document megawika +3.1370056372668855e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document megawika +3.186092015785841e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document megawika +7.271819324142512e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document megawika +0.001451215788905126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document open-web-math-train +0.0014486847196258788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document open-web-math-train +0.0008861032722895899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document open-web-math-train +0.0018119590809459816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document open-web-math-train +0.0008916937917547129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document open-web-math-train +6.960128832809415e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document open-web-math-train +0.002008403651063623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document open-web-math-train +0.0014374900742131454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document open-web-math-train +0.00180213596996716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document open-web-math-train +0.001956178877532413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document open-web-math-train +0.0008829547017667033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document open-web-math-train +0.0008910853619157279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document open-web-math-train +0.0018260998845299973 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document open-web-math-train +0.0012499632072059553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document pes2o +0.00125398260359913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document pes2o +0.0012541704774729071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document pes2o +0.0012527268234360602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document pes2o +0.0012532925243737164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document pes2o +0.0012456396241204315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document pes2o +0.0012589894424352072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document pes2o +0.001508020123999618 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document pes2o +0.00333096950781965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document pes2o +0.0033233414614415547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document pes2o +0.003512387990689828 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document pes2o +0.0035091382940513126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document pes2o +0.003514155927147005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document pes2o +0.003327108000579638 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document pes2o +0.003329106196589836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document pes2o +0.003505604148738077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document pes2o +0.003324825759567855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document pes2o +0.0033248240149804913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document pes2o +0.0033385962112851358 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document pes2o +0.0035043186296553615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document pes2o +0.003340469505431529 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document pes2o +0.0035106889084796276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document pes2o +0.0033309469281030167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document pes2o +0.003340337858029757 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document pes2o +0.003505919861097801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document pes2o 
+0.0003882924098240512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document pes2o +0.0005759963691850877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document reddit +0.0005959971675332674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document reddit +0.0006026179290353799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document reddit +0.0005824184320784846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document reddit +0.0005854598548616037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document reddit +0.0005903767055633473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document reddit +0.0005930306490982049 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document reddit +0.000569425602700746 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document reddit +0.0005675060415179408 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document reddit +0.0005772431621253389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document reddit +0.0005678026053826858 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document reddit +0.0005700398263483378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document reddit +0.0005669467963528824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document reddit +0.0005701015953324305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document reddit +0.0005795907287413296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document reddit +0.0005735602737531164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document reddit +0.0005749862745842101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document reddit +0.0005693257015931971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document reddit +0.0005716568794795563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document reddit +0.0005761083919774021 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document reddit +0.0005688343169797355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document reddit +0.0005807913190929842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document reddit +0.0005710229258078636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document reddit +0.0005704083039826862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document reddit +0.0005862132348308056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document reddit +0.0005717662049559556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document reddit +0.0005858155213694451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document reddit +0.0005812012281792392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document reddit +0.0005803981414588498 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document reddit +0.0005700102108287723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document reddit +0.0005719243459052329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document reddit +0.0005867253401661752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document reddit +0.0005731087218860733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document reddit 
+0.0005712197789109317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document reddit +0.0005702376926310089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document reddit +0.0005700411527742972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document reddit +0.0005828090098178196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document reddit +0.0005770140826168056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document reddit +0.0005723509664597896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document reddit +0.0005755499231836962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document reddit +0.0005636407438471367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document reddit +0.0005640281556500104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document reddit +0.0005633159058766496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document reddit +0.0005638034311151449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document reddit +0.0005630066273073224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document reddit +0.0005631803831128559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document reddit +0.0005631228881679657 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document reddit +0.0005628178701487633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document reddit +0.0005624448092256196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document reddit +0.0005620957024062329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document reddit +0.0005614201504177484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document reddit +0.0005616890951464056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document reddit +0.0005611348559279058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document reddit +0.0005604238061828518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document reddit +0.0005603301490194237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document reddit +0.0005607291294548833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document reddit +0.0005605234569930727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document reddit +0.0005613778566640694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document reddit +0.0005610248539992471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document reddit +0.0005599977416780475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document reddit +0.0005603632562116935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document reddit +0.0005599177479509897 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document reddit +0.0005595202318298379 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document reddit +0.0005600975633499175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document reddit +0.0005614075491213365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document reddit +0.000612563885043477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document reddit +0.0005515469909644413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document reddit 
+0.0005526782014946906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document reddit +0.0005472463408095445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document reddit +0.0005502284746004587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document reddit +0.0005414514790555363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document reddit +0.0005513499500134784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document reddit +0.0005391391454105187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document reddit +0.0005415836910001838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document reddit +0.0005208132468536551 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document reddit +0.0005889827143132871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document reddit +0.0005822520817765276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document reddit +0.0004173155230758696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document reddit +0.0009994361338078242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document stackexchange +0.001087156194657966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document stackexchange +0.0010667737163656816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document stackexchange +0.0009602877882124873 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document stackexchange +0.0008968956271971105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document stackexchange +0.0009198034843762967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document stackexchange +0.0009423901016715341 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document stackexchange +0.0009674094553686345 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document stackexchange +0.0009858331322519164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document stackexchange +0.0009970593645879198 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document stackexchange +0.0010027035193731686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document stackexchange +0.0010128291154221853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document stackexchange +0.0010215631382631918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document stackexchange +0.0010288663771461238 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document stackexchange +0.0010346219929285867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document stackexchange +0.00104544019940344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document stackexchange +0.0010525172676724333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document stackexchange +0.0010609529620775127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document stackexchange +0.0010725892748610153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document stackexchange +0.0010818563598181568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document stackexchange +0.0010992760196793917 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document stackexchange +0.0011178992762079917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document stackexchange +0.001124687532085676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document stackexchange +0.001118303661267191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document stackexchange +0.0010206825575416534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document stackexchange +0.0005512280117499715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document stackexchange +0.004474659408857016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document starcoder +0.00409944473890653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document starcoder +0.005137179939941845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document starcoder +0.005143172251066109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document starcoder +0.005206134363352808 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document starcoder +0.004892747858974329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document starcoder +0.004844731352552902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document starcoder +0.005308320169123755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document starcoder +0.005124709815666577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document starcoder +0.005424710744483826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document starcoder +0.00538244648861977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document starcoder +0.0029107284679086853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document starcoder +0.0026825258998444705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document starcoder +0.0026904503191419243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document starcoder +0.002687906577174073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document starcoder +0.002850165346048818 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document starcoder +0.005322698571717847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document starcoder +0.004450334290869719 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document starcoder +0.004700990083440683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document starcoder +0.003903568556500995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document starcoder +0.00390561515396931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document starcoder +0.0039046402900912262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document starcoder +0.003907454839379547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document starcoder +0.0038583224578603824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document starcoder +0.0037914116657695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document starcoder +0.003786665266798682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document starcoder +0.003792000802430658 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document starcoder +0.00319266847466091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document starcoder +0.0032658716699838944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document starcoder +0.0034801959532460023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document starcoder +0.0028307012092022594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document starcoder +0.0028420360878146276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document starcoder +0.0028410455248484914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document starcoder +0.00283497183526842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document starcoder +0.002840187195459487 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document starcoder +0.0028398709431369834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document starcoder +0.004364722843422023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document starcoder +0.004093255713117101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document starcoder +0.004092331079566252 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document starcoder +0.004005326985579649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document starcoder +0.0036205502856964207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document starcoder +0.003625316793034984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document starcoder +0.003604743435602363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document starcoder +0.0035405823343673125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document starcoder +0.0041601413517253945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document starcoder +0.005886303658937057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document starcoder +0.003600909532810332 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document starcoder +0.0034941365817168658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document starcoder +0.0004992164842980224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document starcoder +0.00032927705604725614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document tulu +0.0002860154190878753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document tulu +0.0002845217585425619 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document tulu +0.0002743528685497456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document tulu +0.00026025323737738766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document tulu +0.00023493876414603155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document tulu +0.00029665994994226705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document tulu +0.00031808102075993956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document tulu +0.00031813573046011285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document tulu +0.0002711905171855542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document tulu 
+0.00028892513401817095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document tulu +0.00030003908676979083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document tulu +0.00026839878771944684 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document tulu +0.00029155935002690497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document tulu +0.0002998624927624209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document tulu +0.0003091705447974841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document tulu +0.00026873195794309786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document tulu +0.00027721873498527547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document tulu +0.0002841662554024377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document tulu +0.0002839461156551537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document tulu +0.0002861705604659811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document tulu +0.0002460995649635886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document tulu +0.00019420142619795496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document tulu +0.00021967677816173628 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document tulu +0.0002620283200480949 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document tulu +0.0002433390542188936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document tulu +0.00021254976608350767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document tulu +0.00022094815569522115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document tulu +0.000342862378668244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document tulu +0.00033784225259118157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document tulu +0.0003367278459543952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document tulu +0.00029843279042852765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document tulu +0.0002926583661257988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document tulu +0.00029320337282010673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document tulu +0.00029281450669483455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document tulu +0.0002915338187002653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document tulu +0.0002864226923084572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document tulu +0.00028643439083586396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document tulu +0.00028253710956299054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document tulu +0.0002810856078805806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document tulu +0.00031474941344656715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document tulu +0.0002139130222205655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document tulu +0.0003084648871862831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document tulu +0.0003309477872140129 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document tulu +0.0003360096824695161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document tulu +0.0003355452655196557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document tulu +0.00038119390366386037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document tulu +0.00038078927630086064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document tulu +0.0003386200917551554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document tulu +0.0002158905159938882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document tulu +0.00021621682877018768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document tulu +0.00021553306942740535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document tulu +0.00021581563462722296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document tulu +0.0002157694110556169 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document tulu +0.000215643699847159 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document tulu +0.00021532716715168094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document tulu +0.00021531221326022472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document tulu +0.0002831801179028896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document tulu +0.0002514844936507595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document tulu +0.00031638782778107964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document tulu +0.0002749197545278445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document tulu +0.00026159721512464495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document tulu +0.0002630052420096968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document tulu +0.00031106811228913666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document tulu +0.0002852973415334161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document tulu +3.7555372465932136e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document tulu +0.003548077173506675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document wiki +0.0018372203137874265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document wiki diff --git a/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt b/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt new file mode 100644 index 0000000000..6f34558ec3 --- /dev/null +++ b/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt @@ -0,0 +1,2419 @@ +0.0018520780893211373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document +0.0017591050606817512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document +0.001459052794333798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document +0.0007405667281569194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document +0.00019420030110896795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document +0.0009008668715801845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document +0.00015115827957143057 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document +0.0014552844319220648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document +0.0012469861325685161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document +0.00136412011372413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document +0.0007064279699221103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document +0.0008472240000687427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document +0.0001984375713341955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document +0.0005472773881697123 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document +0.001815779629850992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document +0.0018313600689757324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document +0.0002583902668716813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document +0.0002646575141232155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document +0.0003165521247456758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document +0.0002920706460176214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document +0.00028396813182810215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document +0.00030445161883108107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document +0.00031628781276576474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document +0.0003083776568189157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document +0.0003176359471472902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document +0.0002536009369131698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document +0.0003067491424681363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document +0.0002597217257557784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document +0.0003788556450109768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document +0.0002796563272052598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document +0.00033573826524290287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document +0.00030523658022800287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document +0.00032211552192240096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document +0.0003329295675164247 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document +0.0003101982186639862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document +0.00032361798234223355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document +0.0003495541581652915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document +0.0002821637448858042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document +0.00030399523537629673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document +0.0002955658968247219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document +0.00028942158502924254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document +0.00028769546171490733 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document +0.0002938111057234182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document +0.0002711150403010948 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document +0.00031130095874747565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document +0.0003002996118160777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document +0.0003732757901604459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document +0.00026784205751795894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document +0.0002799626521661984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document +0.00034334276069078164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document +0.0003582469803674965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document +0.00031094844818418623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document +0.0002766228384977191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document +0.00030297116159471485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document +0.00027033888377464685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document +0.00030090862368377933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document +0.00028543875802490955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document +0.00027559768459074204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document +0.0003182185533962886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document +0.0003311392971435837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document +0.00028751652060804325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document +0.000303466863212589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document +0.00033400462801277524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document +0.0002589234031777426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document +0.0002913508598466723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document +0.0002670572450004856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document +0.00032027399105647656 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document +0.00032188376258379377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document +0.0003161585784100882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document +0.0003184249182974135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document +0.00030381336664000807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document +0.0003190437442184283 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document +0.0002537961798200545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document +0.0003017817117223326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document +0.00028685268513240224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document +0.00031265179094451165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document +0.00034708319096986816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document +0.00026650837943080664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document 
+0.00034588832248507335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document +0.0002416982248399037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document +0.0003089296918222243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document +0.00029137184185700827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document +0.00026464226846800774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document +0.00030545397919456627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document +0.0003206778460448875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document +0.00030968971641110967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document +0.00023325653928600864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document +0.00030526899198338555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document +0.00035376719076633584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document +0.000290224385981026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document +0.000294650083382008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document +0.00028768858128616436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document +0.00030856965235527843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document +0.00030579942447879054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document +0.0002863101084704357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document +0.0002870032092492213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document +0.000264182727569885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document +0.0002974012367036449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document +0.00032238412143059203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document +0.00031683716893819036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document +0.00031157434937617524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document +0.0003411742735695989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document +0.00026778444816570715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document +0.0003037045797275201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document +0.00027746114370081314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document +0.00027148285946862043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document +0.00028042950114678207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document +0.0003235607816590721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document +0.0003086692227306295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document +0.00033990349455148105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document +0.00030945053208470265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document +0.00027309074552265303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document +0.00028737393506316194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document +0.0003098868328009879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document +0.0002614229162588409 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document +0.0002884388407820923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document +0.0031025147279277244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document +0.003102019887362634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document +0.0009996745994661548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document +0.0002406272620255565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document +0.0002404825539493424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document +0.00024062296575435581 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document +0.00024069315766818953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document +0.00024055829162263452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document +0.00024062053397343032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document +0.0002410715545206964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document +0.00024024881846087368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document +0.0002407074700790688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document +0.00024072141428809043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document +0.00024027710230872736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document +0.0002409111299205489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document +0.00024081954058275009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document +0.00024086076794990912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document +0.00024098672620832446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document +0.00024068622303333862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document +0.00024140627024291824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document +0.0002414512033594384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document +0.00024028742594941463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document +0.00024018036089269645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document +0.0002398347365034979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document +0.00024006780153485276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document +0.00024015620270419213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document +0.0002408848259695227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document +0.0002408023185278831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document +0.00024021196580140326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document +0.00024077677271297493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document +0.00024087392454668027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document +0.0002408071293824126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document +0.00024042223828845715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document +0.0002411484752360495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document +0.00023605263746465907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document +0.00023471222158326908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document 
+0.00023432138580287644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document +0.00023407385623382327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document +0.00023487504174367091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document +0.0002341843704976313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document +0.00023421993170282486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document +0.00023445057969132037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document +0.0002337681680073047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document +0.000234627964808109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document +0.0002338942211888584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document +0.00023403849286843386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document +0.00023405641310796305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document +0.00023349169562397965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document +0.00023381157386048856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document +0.00023388742993790587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document +0.00023363103829469813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document +0.00023421141834630477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document +0.00023420564352232565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document +0.00023367463699173143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document +0.00023344969163567033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document +0.00023372196941547188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document +0.00023399207645297834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document +0.00023357915605505856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document +0.00023337585642190864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document +0.00023385005470157914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document +0.00023301533534493465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document +0.00023377864302541782 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document +0.00023323745848621437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document +0.0002330594611151835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document +0.0002334149675026783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document +0.00023198945902291534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document +0.00023023784834634142 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document +0.00022985623060187217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document +0.0002292605284569516 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document +0.00022926593333048894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document +0.00022922766406807777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document +0.00022898153911167426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document +0.0002292473111593315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document +0.000228804579400424 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document +0.00022865485613513526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document +0.00022937426835887895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document +0.00022917388311587372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document +0.0002291660582019043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document +0.00022907895248360543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document +0.0002294617879920205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document +0.0002290452150516566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document +0.00022943405619715553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document +0.0002296271421006204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document +0.00022854791372910372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document +0.00022923123467686557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document +0.00022852404355738494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document +0.00022847798660086642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document +0.0002289604586810316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document +0.00022835479834950643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document +0.0002289149402884243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document +0.00022806655474763446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document +0.00022826296420992974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document +0.00022906829636213627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document +0.0002287628414466998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document +0.0002282673911253445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document +0.00022869309841939134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document +0.0002281540116815451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document +0.0002259755756162738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document +0.00022562331285233504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document +0.0002259061146106053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document +0.00022567670836663787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document +0.00022573165387587061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document +0.00022508514961670572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document +0.00022564642513773356 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document +0.00022563088621998788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document +0.0002250438755373707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document +0.00022524465346241134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document +0.00022531737657666812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document +0.00022444687519363458 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document +0.00022460397498596298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document +0.00022454218976501763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document 
+0.00022447528843671366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document +0.00022501666332178926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document +0.00022453752304377972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document +0.00022484451871163002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document +0.00022465678847154914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document +0.00022453180917044732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document +0.0002247278486823009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document +0.00022465794828242097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document +0.00022431000701925386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document +0.00022476020248460963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document +0.00022467531771795015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document +0.0002236391309945234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document +0.00022458764920536007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document +0.00022430877426744415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document +0.0002247047786127192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document +0.0002245298090400035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document +0.0002245648831396188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document +0.00022292894729820784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document +0.00022236668082957533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document +0.0002217622659895442 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document +0.00022252452726732609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document +0.00022135333211363678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document +0.0002214571757787971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document +0.0002217188139237798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document +0.00022144214894640303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document +0.00022100172806631854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document +0.00022156392409199052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document +0.00022134830143710272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document +0.00022158598922529453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document +0.00022142932483041377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document +0.00022120980907786554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document +0.00022117917738112441 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document +0.00022077089397851235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document +0.00022093265074996711 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document +0.00022091299741377004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document +0.0002205849150703338 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document +0.0002210648204787979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document +0.0002214235747364102 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document +0.00022083907302221787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document +0.0002206334237915964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document +0.00022065193929912214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document +0.00022079775597767288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document +0.00022091492909963518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document +0.00022095009987097293 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document +0.0002208150577180165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document +0.00022085759102772088 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document +0.00022073789170129016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document +0.00022049322781182384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document +0.00022083270617761285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document +0.00021982452827473632 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document +0.00021899870446514259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document +0.00021890358773356361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document +0.00021875556609042841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document +0.00021861195987201226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document +0.00021856782186167455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document +0.00021912837771543515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document +0.00021900213768517756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document +0.00021871675851390374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document +0.0002180537056545586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document +0.0002188196714327129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document +0.00021851362624523464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document +0.0002183236795498736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document +7.291153618675672e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document +0.0003742481815405742 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document +0.00038204855962733055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document +0.00038821818392663593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document +0.00038723332988783727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document +0.00038916141142149904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document +0.00038049542523949033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document +0.0003854755539534284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document +0.00024202756466512517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document +0.0003915405155008087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document +0.0003927382151931033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document +0.0003839151202260479 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document +0.00040006817468967907 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document +0.00040318965964443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document +0.0003831013019452741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document +0.00039166638383204036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document +0.00039962784023961004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document +0.00039536707853602614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document +0.0004204304698247758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document +0.00041538899178693555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document +0.00039186953333675306 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document +0.00038945837196504305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document +0.0003919951238929062 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document +0.00044377065718528966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document +0.0004407759068603017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document +0.0002487811895843715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document +0.00039349432045556636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document +0.00041223198559462343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document +0.0004036573014830213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document +0.0003825982215521807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document +0.00040386867133151386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document +0.00024460575279105167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document +0.000269029789531335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document +0.0003573757493252864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document +0.0004600876681392076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document +0.0002605354166397086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document +0.0003882502452157999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document +0.0002466747612126512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document +0.0004024726105072402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document +0.00040820631128483644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document +0.0002691094350403538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document +0.00026916830387277267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document +0.0004204663297880574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document +0.00042379698687085554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document +0.0004502169227311871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document +0.0002661708937015295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document +0.00031239486948031334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document +0.0003109054589936201 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document +0.00045873053079760646 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document +0.00022904931423244635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document +0.0003813462028433663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document +0.00039188129256500874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document +0.00045124222276983765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document +0.00048138658436853695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document +0.0003944178776279866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document +0.00039941569676754006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document +0.00037952761190240494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document +0.0003944870860881476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document +0.0003891842411856621 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document +0.000387688981934861 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document +0.00039197953876258005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document +0.00039007915280311206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document +0.0003995520363699188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document +0.00039230985654592406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document +0.0003929472067173851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document +0.0003924096172671473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document +0.0003881636143629905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document +0.000389790617937084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document +0.00037351762309221023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document +0.0003630196170929407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document +0.00033532465765142113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document +0.0003076088685761823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document +0.00039463850897720803 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document +0.0002843816115231449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document +0.0002909175709416474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document +0.00028867170997202486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document +0.0002838644617723659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document +0.00029027869525543416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document +0.0002821339567560056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document +0.0002922988877045601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document +0.0002866955958315786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document +0.0002865271754558126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document +0.0002861247475618473 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document +0.0002826681072408606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document +0.0002849746458282827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document +0.0002816966633435316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document +0.00026255342235948463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document +0.0002552895098829678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document +0.00025990194083107813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document +0.0002524062657685835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document +0.0002538577379748611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document +0.0002561415177406761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document +0.00026206253059694905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document +0.00026168095406910565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document +0.0002601305742008613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document +0.00025200823006814814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document +0.0003229951981263502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document +0.00037289448266476045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document +0.0003807825862179898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document +0.0003616333738191483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document +0.0003665117918907636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document +0.0003684186453633228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document +0.0003589330610806066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document +0.00036383861418030395 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document +0.000359841363355303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document +0.00036431044063050464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document +0.0003668574090358279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document +0.000362768263620199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document +0.0003501888032771077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document +0.000352401968221528 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document +0.0003541019701869794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document +0.0003628121865546891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document +0.0003752582953758773 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document +0.00037902046230424966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document +0.0003777927146925147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document +0.0003760676130509053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document +0.00034046049078755405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document +0.0003338847563259091 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document +0.00033294499102761794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document +0.0004912026198265864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document +0.00032064363474664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document +0.00032154190389541214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document +0.00032309660151746207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document +0.00031181143365304544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document +0.00031046092294569104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document +0.00031150165249068046 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document +0.0003041314265988224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document +0.0003024834909739394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document +0.0003019936835833604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document +0.000292329665283177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document +0.0002867061143144972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document +0.00028443615610701707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document +0.00028462291013755945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document +0.0002793538601205013 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document +0.00027306573977044246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document +0.00027097155673336525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document +0.0002752934202112985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document +0.00043042012694697647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document +0.00047495648822986177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document +0.00047755032493473855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document +0.0004706974343933747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document +0.00046682163297771817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document +0.0004616765425874178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document +0.00030644496751628097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document +0.0002909492555358308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document +0.00027272036068261724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document +0.0004101070217315588 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document +0.0003728914338834357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document +0.00036546911442305647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document +0.0003669945482407483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document +0.0003715902407424017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document +0.00035837486406683366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document +0.0003573318538685469 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document +0.0003553784893071916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document +0.0004920659809912352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document +0.0004533619411303183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document +0.00045067066057818706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document +0.00044396985139270645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document +0.00043198288204468477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document +0.00043005174223738454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document +0.00041847118430776784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document +0.00042952036375796664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document +0.00043420594647324267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document +0.0003461123241053012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document +0.0003408581597849182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document +0.00033172705422182547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document +0.0003392566490686136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document +0.00033578341518385483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document +0.0003439196710518844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document +0.00034559163447085543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document +0.00033762478642902825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document +0.00033215210055107224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document +0.00033423579608014966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document +0.0004963355016025102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document +0.0004996862761456923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document +0.0005000551829325451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document +0.0005004212610098755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document +0.00027768695585500585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document +0.00028395983854338433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document +0.00027835826303062254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document +0.0002740073176010804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document +0.0002791830529274016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document +0.0002796863816194411 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document +0.00026697453022672804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document +0.0002594197440280141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document +0.0003779565697649222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document +0.00041835823476586606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document +0.00043788493575265915 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document +0.0002731731970096006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document +0.000276305847423402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document +0.0002704955773958623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document +0.0002629635944827518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document +0.000260070956974436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document +0.00025661553791456334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document +0.00025794727207576157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document +0.00025295733980001527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document +0.0003788106407021029 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document +0.0004882344027669431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document +0.0003275324309642705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document +0.0004803401856640094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document +0.00046720138323433943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document +0.00043527810307095335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document +0.00043905395741627827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document +0.00048774175867331425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document +0.00048380704121346737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document +0.0004779011848346118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document +0.00046255587581908036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document +0.00045127922880511576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document +0.0004503891485256095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document +0.0004450142332303422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document +0.00044630282482516654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document +0.00044325014465743616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document +0.0004263874842796447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document +0.0004217530913646938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document +0.000415120314341852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document +0.00040987168279144537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document +0.00033468337266607834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document +0.0003353094464683005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document +0.0004833936821707294 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document +0.00047194878988920935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document +0.0004648324126996427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document +0.0004562345003964941 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document +0.0004933203505465098 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document +0.0003530166075325466 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document +0.00035368548192804685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document +0.0004872620828289663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document +0.00048293889392426456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document +0.00047936768462267655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document +0.00047821013991587545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document +0.0004660610308564753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document +0.000394683430103437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document +0.00039165053441571324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document +0.0003906936040164381 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document +0.00038074803919159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document +0.0003686529291578143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document +0.00035832920428870976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document +0.00035929024535947033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document +0.0003538226556050544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document +0.0003584167868708799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document +0.0003480507542594234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document +0.0003413709023543034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document +0.00034001304759361455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document +0.00033430532902756514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document +0.00046519252660631277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document +0.0002938876402514769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document +0.00028676090994509047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document +0.00027296150117506716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document +0.00026513502621960483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document +0.0002680081327926125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document +0.00025831225828720344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document +0.00026647037295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document +0.0002525733734572654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document +0.00025831708887575375 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document +0.00042487627444443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document +0.0004951213245023891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document +0.0004804051413177752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document +0.0004662397611340532 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document +0.0004550138655253933 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document +0.00044494909122746795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document +0.0002899112253051385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document +0.0004372879736279761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document +0.0004529568099252922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document +0.00045127826158829573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document +0.0004436558176737439 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document +0.0004419233237678378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document +0.000434589215880319 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document +0.00029153613207706566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document +0.0004312458058738854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document +0.00028741854968757313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document +0.00046853200754421234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document +0.0004949145252030074 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document +0.00044459683920483167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document +0.0003836095306696336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document +0.0003789760237872398 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document +0.0003749227438304427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document +0.0003628558277173369 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document +0.00039468301394041474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document +0.00038874701821614864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document +0.0004158492456077867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document +0.00042360504554060077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document +0.00040386729844317623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document +0.00027595096702902474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document +0.00043638766787829135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document +0.0002218691596850179 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document +0.0004437566108089954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document +0.0003889996411609667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document +0.00043454421906537704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document +0.0004522564392830988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document +0.00041517835659357416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document +0.0002614360863446896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document +0.00037543522111463596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document +0.0004386190133514781 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document +0.00046358333286115075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document +0.00043186261317942404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document +0.0002377581602097957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document +0.00025973334085074254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document +0.00040139099332000796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document +0.00043674860686687174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document +0.00040853289309329373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document +0.000242910191729688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document +0.0004431071731750582 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document +0.0004388092670482523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document +0.000381418866255965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document +0.0004100117296419717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document +0.00042469230366022745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document +0.00041744151905374254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document +0.00022835699906752945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document +0.0004380161085387397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document +0.00044803212381807456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document +0.00040554932796137236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document +0.0004234508646347761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document +0.00043341209652360653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document +0.00023966604734537185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document +0.000259165907316014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document +0.0004270653021833602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document +0.0004341547032162028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document +0.0004111478117275994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document +0.0004299383567984396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document +0.0004241899124590779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document +0.0004502719349364145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document +0.00038994621469645615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document +0.0003859912398894952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document +0.0004247535950310557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document +0.000386982084327716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document +0.0004196451040053251 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document +0.0004096278509782259 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document +0.0004373334932695721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document +0.0004180889975240641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document +0.00042079636929672745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document +0.00038063574611812913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document +0.0003817505891515542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document +0.0004420096268860222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document +0.00039182670726410623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document +0.0003635667850372299 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document +0.00041564996472055667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document +0.000400529358757286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document +0.0003939113874958451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document +0.00039066622068940996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document +0.0004290098538807143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document +0.0004240739958197099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document +0.00040775392659215333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document +0.0004091634200396925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document +0.00042299190476617914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document +0.0003701492680344151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document +0.0003807353844384635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document +0.00038813507771983156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document +0.00040072346558408346 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document +0.0003603595180423597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document +0.00038799421353112465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document +0.00037575235582264926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document +0.0004239190342959713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document +0.0004606044799136546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document +0.00045107950652529253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document +0.0004391947201871058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document +0.0004457516661123035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document +0.0004301297170991686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document +0.00044661704164586694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document +0.0004438849846114837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document +0.0004444205734316823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document +0.0004190924165303394 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document +0.00043942581131677875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document +0.00021568459798090663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document +0.0003814929225407199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document +0.0003217453179359235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document +0.00031719591470267974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document +0.00032434115726922137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document +0.0004079911120371051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document +0.000329492766381148 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document +0.0003845916162001633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document +0.0003835208964390098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document +0.00037847334157173194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document +0.00038296039903791865 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document +0.00037896336828472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document +0.00037620974396391355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document +0.00037420590727111843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document +0.000340490625886403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document +0.0003078314411035827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document +0.00034153990750656097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document +0.0003308858103982067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document +0.0003452640607156025 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document +0.00033095276418403455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document +0.0003116308995860414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document +0.00032446713226408477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document +0.0003015816821912984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document +0.00031612418775706894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document +0.0003278516344971041 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document +0.00033079446736097217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document +0.00032278977146550837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document +0.00032065272988207914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document +0.0003936696452406576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document +0.0003450109536627789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document +0.0003339787189919641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document +0.0003284303856176974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document +0.00033652677276843477 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document +0.0003257822443845694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document +0.0003293985569149334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document +0.0003310360260148262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document +0.0003233770986418526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document +0.0003172280092149422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document +0.0003160674744292835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document +0.00030931090289598506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document +0.0003093173886443107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document +0.00033167847081104083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document +0.00031131501311729723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document +0.00031046608876279845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document +0.00030569235942207244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document +0.00030777943671285197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document +0.00029303314290956683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document +0.0003045824546400205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document +0.00030360880677729793 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document +0.00031646239964835433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document +0.0003129122300603785 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document +0.00031060464956661433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document +0.000311819032500067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document +0.0002977872483902282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document +0.0003009448600922438 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document +0.00028610292098537774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document +0.0002988326876216654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document +0.00028550828372819075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document +0.0002830381750875739 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document +0.0002848495855927156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document +0.0002856443760308144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document +0.00027442895344188584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document +0.0002681160554049462 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document +0.0003421482544126989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document +0.0004005872948449718 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document +0.0003930123959320308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document +0.0003867271832275778 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document +0.000380805140455254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document +0.0003814769861947819 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document +0.00038025170883282324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document +0.0003738026647867475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document +0.00018960856915036276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document +0.0003697177501953134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document +0.00036674194328136693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document +0.00036447406838697555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document +0.00036686410861101255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document +0.00035915267825103423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document +0.0003624758404026675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document +0.0002822812140180794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document +0.00030620512946920813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document +0.000294249776520589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document +0.00030238536967523434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document +0.00029509593361580754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document +0.0002906912701830899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document +0.0002921944165474959 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document +0.00028358919691127954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document +0.0002813182772323272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document +0.00027442640800299205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document +0.0002747820342933984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document +0.0002747584403979717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document +0.00027499129634862444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document +0.0002712050404257197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document +0.0002616256943143254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document +0.00026769938929002815 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document +0.00038396081322727017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document +0.0003863140490027991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document +0.00037702277513203237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document +0.0003633274156107032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document +0.0003587473889240435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document +0.0003507672084278415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document +0.00033776425499780385 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document +0.0003377914127574796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document +0.00032948015659161326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document +0.00033245638541392985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document +0.00031080707640648695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document +0.0002976903331149755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document +0.0002965121463725523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document +0.0002933849695266647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document +0.0002837035078508233 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document +0.00028684569079589323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document +0.0003145192320802359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document +0.0003566937253273515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document +0.0003470199109592918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document +0.0003060245312041868 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document +0.0002650817213818789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document +0.0002643604938780134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document +0.000299350876031416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document +0.0003178540797697938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document +0.000271850367887767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document +0.00031349896596549 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document +0.00031749734412765755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document +0.0003791137842391209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document +0.0003742334169957992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document +0.0003705639757351107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document +0.0003126986769797042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document +0.00031038132814561196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document +0.00036464437173804883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document +0.0003569480488951322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document +0.0003541239221619106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document +0.00035315297411308053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document +0.0003572451925404141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document +0.0003514986129411253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document +0.0003521798298425866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document +0.00034553677439244716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document +0.000349004719809412 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document +0.0003468247484872769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document +0.0003465822608356558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document +0.00035410983132162007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document +0.0003487908354969444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document +0.0003479024763238147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document +0.000341412530646823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document +0.00034451316273667034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document +0.0002618849993484869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document +0.00026788679978901144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document +0.00027450670773227214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document +0.0002661273129899329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document +0.00026836569676402957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document +0.00026155876975483236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document +0.0002609276830117151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document +0.0002644161630512771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document +0.00036789208972872557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document +0.00037829849439990513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document +0.0003788894943523098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document +0.0003617207777959397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document +0.0002541334487248998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document +0.0002707945538071073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document +0.00027046282716455214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document +0.0002652443167243215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document +0.0002685859923850986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document +0.00025734961751176414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document +0.000259041720872915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document +0.00025340107274823446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document +0.00025757135121837893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document +0.00025617700500574084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document +0.0002566931670562857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document +0.0002543871190716101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document +0.00024997565589481713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document +0.0002954079779456287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document +0.00034890741135252835 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document +0.0003473298137731525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document +0.0003296959618486435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document +0.0003304520061604598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document +0.00032377956175729824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document +0.00031700696295168713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document +0.0003060382346081943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document +0.0003012003005056863 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document +0.0002981074073993884 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document +0.0002922128825950705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document +0.000348901087722931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document +0.0003408286289467841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document +0.0003410649680770183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document +0.0003358524215576502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document +0.0003343661874989231 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document +0.00032810573699389156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document +0.00032261449539097497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document +0.0003162694866049203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document +0.0003158381156468853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document +0.000317376061083603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document +0.0003125788639953052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document +0.0003010105041885602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document +0.0003065865059090678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document +0.0003084275726508053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document +0.00030966560718296085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document +0.0002957728057853081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document +0.00029904164542325336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document +0.0002955358888729187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document +0.00028692976446931544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document +0.0002923476214935797 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document +0.0002893691697212419 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document +0.0002855895211981585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document +0.00027968347097626246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document +0.0002810783462604979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document +0.00027794080455729715 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document +0.00034784376461416953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document +0.0003488347959010943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document +0.00034790583710250724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document +0.000345913166618151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document +0.00033801936268066675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document +0.0003290591130212315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document +0.00034051399521366823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document +0.00032470943131841784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document +0.00031679540050914276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document +0.00031814596342422325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document +0.0003156466289485036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document +0.00029985010879003633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document +0.0002905176377776361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document +0.0004206836775460856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document +0.00020660449162246918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document +0.0003461727254468087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document +0.00020592870907067763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document +0.00034173505299233005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document +0.0004052437256652738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document +0.0004080650901351697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document +0.00039778184149144276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document +0.00039046311464950275 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document +0.00039043444911071384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document +0.000388575704932843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document +0.00019737533145666597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document +0.00037610755595812403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document +0.00037315400127598317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document +0.00037415028580922163 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document +0.00036694041707212337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document +0.00018947219857306515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document +0.00037046050826533545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document +0.0003587440768559087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document +0.00034623936498708903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document +0.0003502289592617922 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document +0.00034692398063649823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document +0.000339340809421849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document +0.0003360510394816983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document +0.0003354673850814145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document +0.00032937682875877047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document +0.00032844505049317715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document +0.00028287199339908627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document +0.0002795217197003578 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document +0.00028048955601883463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document +0.0002769326396439027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document +0.0002727090021299243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document +0.0002726577841024554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document +0.00026663619593455374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document +0.00026068042672138127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document +0.0002637704114326801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document +0.0002593043567100412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document +0.0002599897110113453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document +0.0002435078682758859 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document +0.0002450530071379054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document +0.00024233331983743606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document +0.0002934750947999535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document +0.00033241226364044474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document +0.00032938406090272075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document +0.00032778705403953246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document +0.00032184551480398754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document +0.00031874002264945737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document +0.0003165319685666433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document +0.00031307071173376295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document +0.00031119524184911957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document +0.0003102253344576429 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document +0.0003088976240383192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document +0.0002951410823077708 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document +0.00029772657676757413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document +0.0003056048989909935 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document +0.00031991305381648026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document +0.00030890256978362426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document +0.0003109382904091933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document +0.00031035798529690644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document +0.00030741666395911753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document +0.0002989918594861846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document +0.00029569635443989434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document +0.0002973992445667285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document +0.000293397351001072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document +0.00028737817438047954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document +0.00028252738144009747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document +0.0002805511898623541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document +0.0003718020784620472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document +0.0003499713845765235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document +0.00034283547445326676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document +0.00031464759888838765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document +0.00033188946446414833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document +0.000326084432195463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document +0.0003764568303917893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document +0.0003604955598858414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document +0.0003655654554133222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document +0.00035762304033750504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document +0.00038478883950347103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document +0.00027735714341247454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document +0.00028139534607773563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document +0.00019777292251713763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document +0.000285571704874486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document +0.00028543482146244363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document +0.00019434234484256758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document +0.00027854908176986763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document +0.0002847068039566143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document +0.00028672356943064853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document +0.00027782687605808177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document +0.0002843539634105203 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document +0.0002894748379090401 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document +0.0002868852440186493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document +0.0002818504885373851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document +0.00028680112812941034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document +0.00019258978168723977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document +0.00028760637934715155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document +0.0002820439443912918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document +0.0002831001054410018 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document +0.00029001901552467397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document +0.00027779449377883156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document +0.00019949837437516796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document +0.0002907306472984446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document +0.00027814858381318327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document +0.00019472790889161432 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document +0.00020472626596924125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document +0.0002870045081974301 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document +0.00019812241927078482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document +0.0002817553333369554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document +0.00027829782796642117 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document +0.00028289431732284113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document +0.0002795526296717729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document +0.00027682829988044574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document +0.0002895432402719184 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document +0.0002823174903941811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document +0.00028170972351837796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document +0.00027807915877838826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document +0.00028588515681452956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document +0.00028112324090816726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document +0.00020636178289985485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document +0.00019447255290980535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document +0.0002850824220591452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document +0.00027856429520116784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document +0.0002820880676635633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document +0.00028943902215995714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document 
+0.0002676366291085329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document +0.00023806333809954687 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document +0.00024526460430233455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document +0.00023876876664622726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document +0.00023379770334179805 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document +0.00024175151269138382 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document +0.00023386583242595706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document +0.00023771797150160827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document +0.0002262748967483896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document +0.0002408148346432682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document +0.00023398651720444235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document +0.00022989433874474592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document +0.00023948500543957772 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document +0.0002331594076859196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document +0.00023375132439600242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document +0.00023923410909668642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document +0.00023952796315562954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document +0.0002327466076905069 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document +0.00023082758956797212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document +0.0002240509275524448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document +0.00022798879995765268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document +0.000221172516774386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document +0.00021767045123534623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document +0.00021982832794804484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document +0.00021971626543789102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document +0.00022566565206920132 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document +0.0002181984894194856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document +0.00021831417549554653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document +0.00021601405421187145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document +0.00022275733725519607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document +0.00021847734911973986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document +0.0002243591012664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document +0.00021688758139483833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document +0.0002182953624789215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document +0.00020475155724026002 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document +0.00021498078062960065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document +0.0002157914337233064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document +0.00021781838494967963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document +0.00021723242266814558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document +0.0002176782686553837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document +0.0003486179404943968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document +0.00034882846352857634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document +0.00031400868448352596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document +0.00030273484020011963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document +0.00029895889118145404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document +0.00029770764609621714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document +0.0002990181332116852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document +0.00029653733972285996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document +0.00029624649222942476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document +0.00029625609720203576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document +0.00029731928930852147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document +0.00029011721326148513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document +0.00028849788197494655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document +0.00021601278623858145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document +0.00021319599281739178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document +0.0002153325290600083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document +0.00018566946174516558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document +0.00020736824394291617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document +0.00020857419820128004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document +0.00020058526129536423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document +0.00020745812166665217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document +0.00020652171015271702 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document +0.00020643808911278608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document +0.00020040513914482103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document +0.00020598050188272898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document +0.0001969184139343296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document +0.0001972748812937012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document +0.0002038556751586195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document +0.00020245186011313464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document 
+0.00019950381422038783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document +0.00020837055459665258 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document +0.00020371856218246096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document +0.00019537612301625791 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document +0.00019914984508813857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document +0.0002053787713691309 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document +0.00019082100541008637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document +0.00020397153334531813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document +0.0002021462693077317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document +0.00019609357008124035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document +0.00019693256622486236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document +0.00020007239732428112 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document +0.00020467075741591954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document +0.00019584883400022932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document +0.00019135050391176972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document +0.0003362829834208298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document +0.00034013691154784095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document +0.00033215887031941976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document +0.00032681189065396707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document +0.0003149138485493094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document +0.00030179177307540077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document +0.0002923278437581119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document +0.00029470052278994486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document +0.0002994095093045731 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document +0.00029033525096085037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document +0.00029390798852496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document +0.0002916230924130842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document +0.00029419886374594913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document +0.0002865469756730764 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document +0.00021191292549942086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document +0.00021369664817409847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document +0.00021612485624266726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document +0.00022242192634588478 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document +0.00014605095659989698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document +0.00022070626106341693 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document +0.0002174420774054071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document +0.00021325858963116995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document +0.0002124322999488052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document +0.0002081218896969054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document +0.0002108710211556957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document +0.00020686867095978426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document +0.00020895752681041895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document +0.00020741922266415738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document +0.0002069112657197308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document +0.00020644627473468118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document +0.00020332991338121604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document +0.0003560895677789848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document +0.00032915779111908214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document +0.00033810613317040864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document +0.00033729626594036923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document +0.00033550342864602944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document +0.00034173474024556906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document +0.000331505340748827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document +0.0003270050330117195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document +0.00032585275329172556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document +0.0003143383203190604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document +0.00031655199110388894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document +0.00030738872158476413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document +0.00030838388352699285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document +0.0003053596995351888 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document +0.00031836304739584593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document +0.000315315435873905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document +0.0003087116248965243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document +0.00030396790625537645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document +0.0003335812246032149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document +0.00034570956323095843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document +0.00034563035636675786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document +0.00033411265479076335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document +0.00034439191141692787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document +0.0003364483125496565 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document +0.0003299500453608033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document +0.00033163377700074837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document +0.00032638649660627673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document +0.00032616167939645234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document +0.0003205289298760723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document +0.00031939393740815355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document +0.00031593164066731296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document +0.00031928871111254405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document +0.00029670189073175004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document +0.00020517703846735904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document +0.00020128418186172073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document +0.00019662723895606717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document +0.0001981157042081407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document +0.00019703489037041608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document +0.00019079796331785068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document +0.0001909352306690079 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document +0.00018824662295261396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document +0.00019864275319325954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document +0.00018818516521649587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document +0.00018875694972812844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document +0.00018231621170645482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document +0.00018349407845798273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document +0.00018088971427746906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document +0.00018296284236327237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document +0.0001876011825819916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document +0.000329052068725176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document +0.00032223616273648536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document +0.00031272564089633955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document +0.00031621609908414494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document +0.0003117213560911235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document +0.00030218064069945934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document +0.00030658916600512085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document +0.0002915863534115821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document +0.0002940280138374372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document 
+0.00029067860468866085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document +0.00028529228063135635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document +0.00028336893301452256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document +0.0002794668089130099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document +0.00021681361378827842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document +0.0001484664674497246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document +0.00021950558378215133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document +0.00021806860758808645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document +0.00021819568718852282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document +0.00021626925931585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document +0.0001464536143077762 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document +0.00021432777088808917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document +0.000213473805865147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document +0.00021397067253964538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document +0.00020758957647437263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document +0.00020687124337683314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document +0.00020630057046511005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document +0.0002091166859352538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document +0.00020777355025615267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document +0.00020709287641496176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document +0.00020736464660577094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document +0.00020062246741862607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document +0.00020693207561942915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document +0.00021151004871893024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document +0.00019930249098689716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document +0.00021589710041231824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document +0.00021369204789905741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document +0.0002147099923936778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document +0.00021077531190389536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document +0.0002100509829113836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document +0.00021185362601571124 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document +0.00020722136637339565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document +0.00020300093701169531 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document +0.00019859737993313477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document +0.00019971314372100164 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document +0.00019549908270269278 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document +0.00019649820843534028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document +0.00019619415513498067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document +0.00019493006120377898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document +0.00019499409035775506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document +0.00019252988593634277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document +0.00019440768268686405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document +0.00018747161324755577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document +0.0001879575932372779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document +0.00019040707058357506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document +0.0001871931095090703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document +0.00020112966223017096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document +0.00020516878165311017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document +0.00020664735191740533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document +0.00021041398572882962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document +0.00020397992929690396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document +0.0002039978580295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document +0.00020592785601142126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document +0.0001990755527445265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document +0.00019729564847798732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document +0.00019958182230527032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document +0.0001985037302636386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document +0.00020204130355115716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document +0.0002000296401958085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document +0.0001983064832295463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document +0.00019663108484195617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document +0.00019510678560556523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document +0.0001873284057063206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document +0.00019311553072495885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document +0.00034652137288816547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document +0.0002813690318850024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document +0.00027697649713138685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document +0.0002755419092534421 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document +0.0002681583054440219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document 
+0.00026945753192750824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document +0.00026169470768245737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document +0.00026437008960810825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document +0.0002637294838228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document +0.00026491867965088836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document +0.00025504483625138986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document +0.0002545040623796586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document +0.0002546682814073622 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document +0.00025545439487142615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document +0.0002626896557978271 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document +0.00025092040940402784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document +0.0002589154885863872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document +0.00024106160482721467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document +0.0002483289690087987 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document +0.0002388930282784437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document +0.00024006340759273874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document +0.00023765248178029045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document +0.00023061351965578936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document +0.00024954224883546477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document +0.00017861017233018525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document +0.00017810832743667658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document +0.00017599709170759497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document +0.00017462723516505223 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document +0.0002906316527068669 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document +0.00033762141066247166 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document +0.00017170670574152494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document +0.00017258674515137717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document +0.0002815386173173926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document +0.0002996845935618989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document +0.0002735268488987296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document +0.0002971738713071517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document +0.0002942690674002763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document +0.0003322222207729567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document +0.0003378721656198464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document +0.00018307262621851067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document 
+0.00033956081502775057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document +0.00031604820927876276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document +0.00028805657681088917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document +0.00026312293321215633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document +0.00034366936722921455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document +0.0002865256504406559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document +0.0003063615195861786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document +0.00028412791619666136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document +0.00028060835132727154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document +0.00032544974761560506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document +0.0002647177833217225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document +0.0003152621884896575 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document +0.0003054625140336913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document +0.00031183308312292263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document +0.00018175026696621178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document +0.00017699918328872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document +0.00018222339261441908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document +0.00018348005930964137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document +0.0001810735993810541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document +0.00030846441282038914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document +0.0002972326889310354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document +0.00017433421318235594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document +0.00032799458649525895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document +0.00032482130048512673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document +0.00031943465668672475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document +0.00029615593630484517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document +0.0002893126939511001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document +0.0002849288351723284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document +0.00028383906633569267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document +0.00028072526091262615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document +0.000284239564292377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document +0.0002778903109432523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document +0.0002771644389501471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document +0.0002733316182319337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document +0.00026362539185869363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document 
+0.0002636325383220217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document +0.00026740622442302886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document +0.0002646771971853427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document +0.0002628566720605389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document +0.0002644760695434766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document +0.0002623837702310999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document +0.00026088722976772894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document +0.0002567065374799158 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document +0.00018857382101207726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document +0.00019036580399817203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document +0.00018348828065261222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document +0.00018491851780345073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document +0.00018904887260080187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document +0.0001875609304251801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document +0.00018393034720015817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document +0.00018419795526114903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document +0.00018699955623404795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document +0.00018276256902965128 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document +0.00017698045695190812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document +0.00018104650132303642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document +0.00017758206731279688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document +0.00017131402995103497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document +0.000175944428350446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document +0.0003416745727147391 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document +0.0003163259373952889 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document +0.0002804489269172448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document +0.00028748272397403175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document +0.00027603318345630605 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document +0.000271638824679648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document +0.0002763761210210942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document +0.00026501984873172717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document +0.00026422486894694714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document +0.0002686339100849262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document +0.0002610837453940606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document +0.000260974343729353 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document 
+0.0002599403837029134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document +0.0002937273113238609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document +0.0003341790732600504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document +0.0002620661576600244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document +0.0003027929169239288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document +0.00031944039129326894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document +0.00019025676304139009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document +0.00018680910145009907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document +0.00034215840419416437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document +0.00018618120812119364 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document +0.00018605853095599425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document +0.00018120712626096538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document +0.00018315079292495327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document +0.00018362556449041974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document +0.0001780024456718171 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document +0.00033296526436178697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document +0.0001802398632282846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document +0.00017340263100798256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document +0.00017755840547238697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document +0.00018419413735260606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document +0.00017869518174591322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document +0.00017526271460129484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document +0.00017852168597981907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document +0.00017566536156787157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document +0.00017589867964432936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document +0.00017831487394075305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document +0.00017837310528935862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document +0.00018200908814216548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document +0.0001795136627511612 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document +0.0003414021775300033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document +0.00017177291787788502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document +0.0003441900648571877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document +0.0003394534597060673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document +0.0003236887233114832 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document +0.0001639544129688747 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document +0.00019137443753211255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document +0.00018575146284680153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document +0.00019184792863440243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document +0.00018966043065679055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document +0.00017968851317035848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document +0.00018479881897661546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document +0.0001813642692683015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document +0.0001686449798983066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document +0.00018516104592230446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document +0.00031283726601066385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document +0.0003248607542883853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document +0.00031583241601202365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document +0.00031238270857730376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document +0.000307150592403979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document +0.00029443829986847044 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document +0.0002942723732234677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document +0.00023514930666443422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document +0.0020776328951453444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document +0.0021768234410538883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document +0.002106973549276289 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document +0.002110915756171751 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document +0.0017032382109816464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document +0.0019047944877712286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document +0.0019402711744016077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document +0.0006264790011223686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document +0.0017885401938106643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document +0.0003547982093445404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document +0.00035934014428504944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document +0.00035707704501371544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document +0.00035287930712815354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document +0.00035977166728996823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document +0.0003581675664109838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document +0.0003548617059697185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document +0.0003639582000286208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document +0.00035375839698688127 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document +0.0003743722020080678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document +0.0003530399715341242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document +0.00035511875882752406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document +0.0003618733574783154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document +0.00035185243285420104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document +0.0003541503739732106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document +0.0003631679485751914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document +0.00035748045578182274 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document +0.0003606490690555877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document +0.0003626383296610091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document +0.00035442644361264756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document +0.00035978370170539796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document +0.0003585562375341541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document +0.0003601958372888019 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document +0.000350277765402227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document +0.0003616521184211704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document +0.0003620625543608188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document +0.0003560781983850704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document +0.0003553209610592676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document +0.00035905348643915075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document +0.00034744258805696526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document +0.00035462784035661496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document +0.00034768186175100895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document +0.0003568534635532736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document +0.00035586511544371234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document +0.0003524567827568137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document +0.0003512453770426313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document +0.0003591792726468799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document +0.0003514024529343127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document +0.0003584880112586934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document +0.00035133552916418045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document +0.0003600811981350215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document +0.0003571663974228119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document +0.00035768103378874214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document +0.00035939205561113694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document +0.00035186773916029825 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document +0.0003542829672490847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document +0.0003592783642898726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document +0.0003556367340099302 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document +0.00035391392271377027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document +0.00035486725707484836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document +0.00034866743396828035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document +0.0003517219808644735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document +0.00034874458549673823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document +0.000355773136961014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document +0.00035611750387841917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document +0.00035305602013916315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document +0.0003578207127071924 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document +0.00035514635841943707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document +0.00034816946212866206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document +0.0003512707269761496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document +0.0003483392117980654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document +0.0003572169607204321 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document +0.00035139153281660794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document +0.00035536422129036537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document +0.000352017164107143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document +0.000351889550179365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document +0.000358759689953589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document +0.0003569286079869268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document +0.0003657752958602099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document +0.00035396127934790697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document +0.0003618565071224743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document +0.00035146051531973204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document +0.00036107135765783567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document +0.00035019554279994576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document +0.00035567858879904983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document +0.0003504753174793183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document +0.00035931140831329194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document +0.0003502967866002823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document +0.0003532911801041972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document +0.0003583543013070199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document +0.0003566243489931224 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document +0.0003468752314799221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document +0.0003597840618138091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document +0.00035128822484768084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document +0.00035889496943437507 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document +0.000352400524650424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document +0.0003518689536768735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document +0.00035866864741303467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document +0.0003454687659106334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document +0.00035348007259317576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document +0.0003539752270940644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document +0.00035146495994081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document +0.00035397212846310423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document +0.00035208246467162587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document +0.0003490843168676626 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document +0.00035299633658644394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document +0.00034868327466167065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document +0.00035941351365601583 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document +0.0003545343062735255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document +0.0003528956380445978 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document +0.0003553355770443352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document +0.0003644224004937743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document +0.00035234291036216907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document +0.0003596237469847771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document +0.0003531996065735989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document +0.0003547177054106099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document +0.0003575586499260483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document +0.00035262635135283667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document +0.0003624191962188944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document +0.0003488398052948616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document +0.0003598294093147917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document +0.00035583006534466323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document +0.00035403139653225103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document +0.00036134702642187156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document +0.0003573689927162834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document +0.0003577141131435527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document +0.00035208814419277406 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document +0.00035996720683665625 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document +0.00035415304658912596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document +0.00036353353029443546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document +0.0003537326003150983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document +0.00036053976358299083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document +0.000352380489373494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document +0.00036154661616900994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document +0.00035959332325963614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document +0.0003597954667189692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document +0.0003563108270597542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document +0.0003582891940460143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document +0.0003497728210484297 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document +0.0003549834902179354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document +0.0003529828233484542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document +0.00034627483903285777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document +0.00035569006572589215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document +0.00035449377946910314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document +0.00035802844396194623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document +0.0003617277809353208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document +0.00035034118898654814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document +0.000351091193908611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document +0.0003527914342210668 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document +0.00035028288369781376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document +0.00035775745592780506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document +0.0003449630690661468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document +0.0003583490698830361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document +0.0003476995746684122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document +0.0003535632505019212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document +0.00035640180641147417 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document +0.000361731045691765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document +0.0003534082129597368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document +0.0003550344149828664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document +0.00035363002411364057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document +0.0003537265579677396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document +0.00034950531383577937 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document +0.00035008511827347514 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document +0.00035594533400871325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document +0.00035266312861335946 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document +0.00035280268794863923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document +0.0003565470391528536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document +0.0003588492322689137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document +0.00035469909697832775 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document +0.00034712082813410526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document +0.000348701157101807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document +0.0003500192014479944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document +0.00035120560544669755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document +0.00035403656850437445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document +0.00035852376560749366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document +0.0003534754068111774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document +0.00035591740046720765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document +0.000348522354782563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document +0.0003533533959664415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document +0.00035631425964030697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document +0.0003485886551574741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document +0.00035917652631065777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document +0.0003482975272111288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document +0.00035580661277480167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document +0.0003492290722955348 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document +0.00034989284450240613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document +0.0003545677216162781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document +0.00034622286859463484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document +0.00036070626989861965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document +0.00035518365036320786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document +0.00035272907057848406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document +0.0003547343638218734 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document +0.0003496450144966242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document +0.0003537407829294287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document +0.0003489722653985685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document +0.00035057186899911295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document +0.0003507566548933051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document +0.00035630360179023747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document +0.00035631362503416367 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document +0.0003490204248026821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document +0.00035761724058371226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document +0.00035037664777467137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document +0.000353402110481068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document +0.00034524163568371745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document +0.00035528523728570974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document +0.00034784916132431703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document +0.00034928476408048925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document +0.00034989205973784984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document +0.00034201664404094254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document +0.0003529676016338611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document +0.00034643433682346637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document +0.0003511666373001904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document +0.00034828669066575333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document +0.0003494625207264413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document +0.0003458957535879216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document +0.0003543020478990003 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document +0.00034754384069014956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document +0.0003598856392240133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document +0.0003503335458553846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document +0.00035919595619778716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document +0.00035767737970754404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document +0.00035197152783998165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document +0.0003549609834422404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document +0.0003568184100569753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document +0.0003512652818651935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document +0.00035912648958665754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document +0.00034764526964056546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document +0.000352439784960359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document +0.00035295886560764226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document +0.0003518132693658672 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document +0.00035589987915465713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document +0.00034923863317385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document +0.0003457987267929692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document +0.0003560928663480501 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document +0.0003529603811204932 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document +0.0003524438555443043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document +0.0003438847030263783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document +0.00035981978898461613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document +0.0003446342778566972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document +0.00035529584995236537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document +0.00034855740895831116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document +0.00034932634912802544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document +0.00035805518303064666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document +0.0003497941877073061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document +0.00035774398685405447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document +0.0003560421780316607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document +0.0003508844468369392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document +0.00035731928892270107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document +0.0003557884626314314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document +0.00034992996760289355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document +0.000360752554360921 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document +0.0003452321668708545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document +0.0003591745226131023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document +0.00035256981433229084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document +0.00035378123159712034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document +0.000350464354895999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document +0.00035074625557389677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document +0.00035025894701994667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document +0.00035437902514857614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document +0.0003514684519732232 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document +0.00035449717909633905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document +0.0003436816402714221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document +0.00035139158071782116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document +0.0003509424079843335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document +0.000343894618577506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document +0.0003500789770661659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document +0.0003407788080680086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document +0.0003581908175239701 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document +0.0003465541618780918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document +0.00034600228792437736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document +0.00034416738982773204 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document +0.0003519900340150641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document +0.000343369616864659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document +0.0003544993883274688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document +0.0003504441365073392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document +0.00034859160702727056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document +0.00035355909532647185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document +0.0003471900922691849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document +0.0003563015508709187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document +0.0003487888744148821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document +0.00034711767548688336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document +0.0003530734609369085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document +0.00035123969242560935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document +0.0003517127620891489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document +0.00035232835416868673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document +0.0003524437481912308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document +0.0003525996167005602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document +0.00035064770545242043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document +0.00035311558274981226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document +0.00034952204800569914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document +0.0003541471367344846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document +0.00035418812454561825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document +0.0003528951372900714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document +0.0003542338042975688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document +0.00034937738939942796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document +0.0003522182190878447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document +0.0003501406466507449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document +0.00034973079877492633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document +0.0003485274567713538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document +0.00034999308679368985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document +0.0003570051724707296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document +0.00034567230462019706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document +0.00035529000940160696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document +0.00034956512308671755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document +0.0003496962834028953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document +0.0003468745282493457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document +0.0003502717155809202 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document +0.0003556240880896514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document +0.0003515109488424343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document +0.0003563156688192592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document +0.00035040277363989817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document +0.0003481408593290717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document +0.0003624575124332874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document +0.0003522684124250313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document +0.00035286996027653544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document +0.00034967623997256725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document +0.00035182649587602765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document +0.0003524892557026489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document +0.0003507642477451811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document +0.00036190408389835666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document +0.00035102739424880766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document +0.00035239718753257265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document +0.00035298076121821316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document +0.0003478704389752654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document +0.0003503109191567942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document +0.00035143250975654426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document +0.0003480663923069012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document +0.00035691540219998623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document +0.000348815437166351 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document +0.00035202073257766225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document +0.0003491569096274706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document +0.00035277390475511834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document +0.0003524972090026609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document +0.0003504854249750236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document +0.00034740238025423914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document +0.00034968015462277606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document +0.0003493798632762674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document +0.0003488202537862122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document +0.0003525461864643725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document +0.00034903815232825664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document +0.00035536982539258216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document +0.00034858083265155483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document +0.0003505014973608067 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document +0.00035327984042622104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document +0.0003503286677453136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document +0.00035835274842442816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document +0.00034970302660275595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document +0.000357929573140149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document +0.0003517238649788585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document +0.00036097027318848475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document +0.0003502734074110026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document +0.00035801510806036273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document +0.0003568006373479869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document +0.00036128108717454636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document +0.0003563436883111686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document +0.00035559725321852463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document +0.00035089656006854944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document +0.000359453964362057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document +0.00035629498059104033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document +0.0003622207707090437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document +0.0003540946784512821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document +0.0003594750565232011 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document +0.0003566007415086991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document +0.0003562142599126134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document +0.0003569948186744601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document +0.00035166554847920186 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document +0.00035047994419295137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document +0.0003561578193739437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document +0.00035470866838811544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document +0.00034216920464876335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document +0.0003550021513075795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document +0.0003488045105938729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document +0.0003513340720840151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document +0.0003448558566387584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document +0.0003460966026953241 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document +0.0003488157616036459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document +0.0003446120387842362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document +0.000351528602987427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document +0.00035661118227454713 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document +0.0003551342699877457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document +0.0003478953397924445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document +0.00034625782458988215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document +0.0003527515447405871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document +0.00034823744889805696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document +0.00034823314560254406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document +0.00035162668292961944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document +0.0003477307716074623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document +0.0003446457989477787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document +0.00034782916273767795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document +0.0003517249130302248 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document +0.0003449873430908556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document +0.00034841291749669877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document +0.0003466028498941749 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document +0.0003486436831199424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document +0.0003478279234211838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document +0.0003495903653274374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document +0.00034896893881218957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document +0.000348941645312426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document +0.0003474221308416894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document +0.0003462621543839385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document +0.0003669373860863891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document +0.00034691156268163006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document +0.0003527774103765281 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document +0.00034684565672734663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document +0.0003454250599604457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document +0.0003541536557159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document +0.000345735737037366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document +0.0003524669816385214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document +0.0003441817133096468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document +0.0003519093265859089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document +0.00035080085480352095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document +0.00035285227929327434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document +0.00034354836346901676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document +0.00034789770937373467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document +0.000343665920520102 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document +0.0003490884931060568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document +0.00034380029463398654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document +0.00034874768005099945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document +0.0003457058510967673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document +0.00034644265227023904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document +0.00035008339858594957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document +0.0003462377193296194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document +0.0003620491787114201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document +0.000348717011044469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document +0.00034370072363913706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document +0.0003551981066775649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document +0.0003500119496799342 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document +0.0003485082952669081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document +0.0003508155580978919 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document +0.00035311375163251416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document +0.00034945972003423253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document +0.0003474220353789879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document +0.0003536443686585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document +0.0003560350489042953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document +0.0003493655927914396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document +0.0003528423977146383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document +0.00035255554724471217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document +0.0003479760010190111 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document +0.00035458598862501956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document +0.0003458990560538315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document +0.00035157946422379875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document +0.00034736860650169996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document +0.0003529152313394119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document +0.00034586294329524465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document +0.00035707214923794877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document +0.0003509580363496512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document +0.00035244176725524474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document +0.0003467539557999047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document +0.00034919687962275546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document +0.00035094031731719953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document +0.0003484309008351352 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document +0.0003485409424916253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document +0.0003499590776117838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document +0.0003492842758957848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document +0.0003529712275178912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document +0.0003566141287087449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document +0.0003649496522047409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document +0.0003563218912208234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document +0.00035614782126966145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document +0.0003531944298453266 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document +0.0003535950949566616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document +0.0003544295554928795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document +0.0003519908503740376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document +0.00035752817626134463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document +0.0003515322689589972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document +0.0003486893890307115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document +0.0003446520464889867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document +0.0003509421562481707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document +0.00035335015702909084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document +0.0003490178167345008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document +0.0003520497821155174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document +0.0003549762618908944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document +0.00035072190850833103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document +0.0003542458638526423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document +0.000352419194572916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document +0.0003545102564672614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document +0.0003495437992331806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document +0.0003542843376993964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document +0.000352827529313958 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document +0.00035442506093223886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document +0.0003496970719044257 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document +0.0003553096424442362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document +0.00034986845565067564 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document +0.000352131055186658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document +0.0003527021708198983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document +0.00034905885414547214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document +0.0003583433842468394 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document +0.00034409435202828383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document +0.00034846410520871483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document +0.0003554459991927314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document +0.00035310507471843076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document +0.000350028910786098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document +0.00035049727458009896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document +0.0003519047735925826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document +0.0003513027429919726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document +0.0003626947260354396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document +0.0003500087324849783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document +0.0003618315726725285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document +0.0003535385113938023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document +0.0003487064058517615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document +0.0003618709124780938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document +0.00035040070335625915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document +0.0003506279032267829 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document +0.0003498435310527524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document +0.0003554634749821431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document +0.00035091209738758963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document +0.00035034103678978573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document +0.00035398931854386146 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document +0.00035495529304989485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document +0.00036067883473356603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document +6.322825248625475e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document +2.4432314037946264e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document +5.6313888721313454e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document +2.4208171781595055e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document +2.325811856369237e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document +2.4010790356322705e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document +5.36773610843632e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document +1.360574433501002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document +1.3076540344853244e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document +1.3386534334886313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document +1.2498103719605153e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document +1.403763836949682e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document +1.3636756723495417e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document +1.2242489446940814e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document +1.2398255818973339e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document +1.2972616994216281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document +1.3947809855914134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document +1.3144843787829514e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document +1.1693809976572487e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document +1.3677252682893802e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document +1.3940876719849597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document +1.4222245138730965e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document +1.3201677767919704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document +1.1421717796486169e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document +1.2890514724498703e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document +1.3649507648749037e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document +1.2400732563490717e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document +1.1557681453277616e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document +1.2294483595964517e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document +1.2137484472122283e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document +1.3299663426456e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document +1.2461984216479532e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document +1.4666434217609636e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document +1.1876997894686238e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document +1.2939155338964078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document +1.3859590039728515e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document +1.317917848615668e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document +1.1335281536110342e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document +1.2889923952861426e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document +1.3471671647053326e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document +1.2221720014475102e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document +1.2632647276287541e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document +1.28276219004076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document +1.36213704321643e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document +1.2414858625261553e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document +1.3173700421883744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document +1.295597796725686e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document +1.242783936442904e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document +1.2417374088427464e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document +1.2134479405400744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document +1.3090040663304255e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document +1.2713470581614905e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document +5.5750231378906594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document +5.777597358425469e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document +5.349786767471258e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document +5.675165050453583e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document +5.482611216158831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document +5.065421899890121e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document +5.384718357480146e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document +4.872037363236061e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document +4.532709250783155e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document +5.7257963030489613e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document +4.9014365579652036e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document +5.722863552770969e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document +6.149911636146833e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document +5.2178057608273506e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document +4.990228161160431e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document +5.866186875255134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document +5.004185734360719e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document +4.79401853705107e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document +5.435219965052376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document +5.035997225792266e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document +5.622401774211625e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document +5.028826157387559e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document +5.596379470128795e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document +6.027824493191489e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document +5.5358270009931474e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document +5.9839051807685496e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document +5.1221077499249595e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document +5.517228560620279e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document +5.1687858285052305e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document +5.684188244145645e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document +5.212693275535878e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document 
+4.8551007022784084e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document +5.4888506639203145e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document +5.345098688527242e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document +4.8506420625516594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document +5.132168603397676e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document +5.719476795114223e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document +5.7448621149792696e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document +4.9068410568059265e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document +5.382937299647678e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document +4.8288432136304634e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document +5.841703200305416e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document +5.1589611587885584e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document +6.031113829732574e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document +5.4558202844532094e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document +5.341852317196142e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document +5.1402942738369954e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document +5.735421384377395e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document +5.473629863586958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document +5.4708993245733936e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document +4.931161863634078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document +5.104173022127248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document +5.510157161510824e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document +5.652501401782597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document +5.7273656573031666e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document +5.638363224821738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document +5.6128115396668704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document +5.00304877998141e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document +5.596120554779096e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document +5.5280923889040006e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document +5.223477917938408e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document +5.29472809986569e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document +2.205682378243213e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document +1.4367563720603185e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document +3.5506193487931076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document +3.0442910855821778e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document +2.2540042508019627e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document +2.6880163202623216e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document +2.534473148048727e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document +2.6560945431318916e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document +2.547470248967691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document +2.5248825388073738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document +2.5828729575000054e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document +2.4026583817957736e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document +2.3930425429834413e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document +2.5037365362599724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document +2.6696745470595603e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document +2.140323051341762e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document +2.617354786691592e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document +1.538359101762691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document +1.2871029252377856e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document +2.255195411289217e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document +2.4832313897952067e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document +9.303873918189968e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document +2.179532302620228e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document +1.9750517506901206e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document +2.7740420380648435e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document +2.7813714782319335e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document +4.1595357937609806e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document +2.741365122389175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document +2.117451071361901e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document +1.7132649760565998e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document +1.7492547092602047e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document +1.7499951097392276e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document +1.6632444789170958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document +1.6678802252361607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document +1.5519208704558896e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document +1.652420992967167e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document +1.6119931034508755e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document +1.6638882076736552e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document +1.7198076782652946e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document +1.572927860565175e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document +1.5194822618169918e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document +1.6677776832669846e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document +1.595612492245688e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document +1.682350633181197e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document +1.663983380609724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document +1.710187842689243e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document +1.5733697527539038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document +1.6972104757911438e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document +1.6610142847616577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document +1.61094882403031e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document +1.4789207305138325e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document +1.639299617676302e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document +1.3241204512116132e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document +8.582260726625535e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document +8.213000975576739e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document +9.549247732811947e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document +9.17242785339013e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document +7.632868223725218e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document +8.674401118222175e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document +9.124384255505347e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document +8.344222222417358e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document +8.992299957499065e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document +8.76689497361025e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document +7.973396239586015e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document +9.006935606644125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document +8.725545954955498e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document +1.215449694669174e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document +3.3041720284158646e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document +2.0593512412624502e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document +1.893608946986248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document +1.737111666788535e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document +1.4915923449873955e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document +2.289370239067605e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document +2.8615335689614638e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document +8.847283630883125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document 
+1.8175470362373804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document +1.8152226683368038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document +1.789149655314284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document +1.7690523036477663e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document +1.8333732213753644e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document +1.8794105687718654e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document +1.721841156706417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document +2.0612008685724796e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document +1.9297370681336376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document +2.0188440409661018e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document +5.1741216329695265e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document +1.3417913926038429e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document +1.1010813016469651e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document +1.1252416134320087e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document +1.2801744104313002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document +1.3041514955795817e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document +1.3428837580879075e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document +1.320809382267804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document +1.3451566676555968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document +1.228284926657501e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document +1.2410599573923043e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document +1.3815343367377182e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document +1.3895126265148832e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document +1.2306773644401741e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document +1.32981021906281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document +1.101337469221607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document +1.513094184404692e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document +1.1073759547073234e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document +1.2879348765857567e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document +9.619595770228435e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document +1.2384340836286436e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document +1.1766667232211577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document +1.2871049236196452e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document +1.2010645926497744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document +1.3971428231518597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document +1.2283733550547932e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document +1.2659530508255308e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document +1.551775613074462e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document +1.1169413343776979e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document +1.1433700593712463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document +4.964773647323492e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document +1.0995586595687313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document +1.2957393071411267e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document +2.75899247407709e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document +2.8269344597344854e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document +2.329108187246831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document +2.4231761430460284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document +1.2434140512230442e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document +1.638718338352859e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document +3.272953556801187e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document +6.061314500486327e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document +1.2465979731210292e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document +1.2737557327967737e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document +1.038428658075627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document +2.61666472045566e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document +3.6506873212272224e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document +1.5066359138295701e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document +1.1166290872121178e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document +1.5546966228590285e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document +1.2583434625014828e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document +1.3398826881300862e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document +1.2944933160515968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document +1.0971437399901365e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document +1.2787922795775774e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document +1.404979227816985e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document +1.3344734431324463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document +4.886031157107555e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document +3.277261443596394e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document +3.5057957685786495e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document +3.287625301718589e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document +3.1370056372668855e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document +3.186092015785841e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document +7.271819324142512e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document +0.001451215788905126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document +0.0014486847196258788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document +0.0008861032722895899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document +0.0018119590809459816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document +0.0008916937917547129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document +6.960128832809415e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document +0.002008403651063623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document +0.0014374900742131454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document +0.00180213596996716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document +0.001956178877532413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document +0.0008829547017667033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document +0.0008910853619157279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document +0.0018260998845299973 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document +0.0012499632072059553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document +0.00125398260359913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document +0.0012541704774729071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document +0.0012527268234360602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document +0.0012532925243737164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document +0.0012456396241204315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document +0.0012589894424352072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document +0.001508020123999618 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document +0.00333096950781965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document +0.0033233414614415547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document +0.003512387990689828 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document +0.0035091382940513126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document +0.003514155927147005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document +0.003327108000579638 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document +0.003329106196589836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document +0.003505604148738077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document +0.003324825759567855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document +0.0033248240149804913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document +0.0033385962112851358 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document +0.0035043186296553615 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document +0.003340469505431529 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document +0.0035106889084796276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document +0.0033309469281030167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document +0.003340337858029757 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document +0.003505919861097801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document +0.0003882924098240512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document +0.0005759963691850877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document +0.0005959971675332674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document +0.0006026179290353799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document +0.0005824184320784846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document +0.0005854598548616037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document +0.0005903767055633473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document +0.0005930306490982049 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document +0.000569425602700746 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document +0.0005675060415179408 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document +0.0005772431621253389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document +0.0005678026053826858 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document +0.0005700398263483378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document +0.0005669467963528824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document +0.0005701015953324305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document +0.0005795907287413296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document +0.0005735602737531164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document +0.0005749862745842101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document +0.0005693257015931971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document +0.0005716568794795563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document +0.0005761083919774021 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document +0.0005688343169797355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document +0.0005807913190929842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document +0.0005710229258078636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document +0.0005704083039826862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document +0.0005862132348308056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document +0.0005717662049559556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document +0.0005858155213694451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document +0.0005812012281792392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document +0.0005803981414588498 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document +0.0005700102108287723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document 
+0.0005719243459052329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document +0.0005867253401661752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document +0.0005731087218860733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document +0.0005712197789109317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document +0.0005702376926310089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document +0.0005700411527742972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document +0.0005828090098178196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document +0.0005770140826168056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document +0.0005723509664597896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document +0.0005755499231836962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document +0.0005636407438471367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document +0.0005640281556500104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document +0.0005633159058766496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document +0.0005638034311151449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document +0.0005630066273073224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document +0.0005631803831128559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document +0.0005631228881679657 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document +0.0005628178701487633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document +0.0005624448092256196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document +0.0005620957024062329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document +0.0005614201504177484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document +0.0005616890951464056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document +0.0005611348559279058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document +0.0005604238061828518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document +0.0005603301490194237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document +0.0005607291294548833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document +0.0005605234569930727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document +0.0005613778566640694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document +0.0005610248539992471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document +0.0005599977416780475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document +0.0005603632562116935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document +0.0005599177479509897 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document +0.0005595202318298379 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document +0.0005600975633499175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document +0.0005614075491213365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document +0.000612563885043477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document +0.0005515469909644413 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document +0.0005526782014946906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document +0.0005472463408095445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document +0.0005502284746004587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document +0.0005414514790555363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document +0.0005513499500134784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document +0.0005391391454105187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document +0.0005415836910001838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document +0.0005208132468536551 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document +0.0005889827143132871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document +0.0005822520817765276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document +0.0004173155230758696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document +0.0009994361338078242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document +0.001087156194657966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document +0.0010667737163656816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document +0.0009602877882124873 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document +0.0008968956271971105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document +0.0009198034843762967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document +0.0009423901016715341 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document +0.0009674094553686345 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document +0.0009858331322519164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document +0.0009970593645879198 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document +0.0010027035193731686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document +0.0010128291154221853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document +0.0010215631382631918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document +0.0010288663771461238 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document +0.0010346219929285867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document +0.00104544019940344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document +0.0010525172676724333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document +0.0010609529620775127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document +0.0010725892748610153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document +0.0010818563598181568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document +0.0010992760196793917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document +0.0011178992762079917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document +0.001124687532085676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document +0.001118303661267191 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document +0.0010206825575416534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document +0.0005512280117499715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document +0.004474659408857016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document +0.00409944473890653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document +0.005137179939941845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document +0.005143172251066109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document +0.005206134363352808 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document +0.004892747858974329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document +0.004844731352552902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document +0.005308320169123755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document +0.005124709815666577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document +0.005424710744483826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document +0.00538244648861977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document +0.0029107284679086853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document +0.0026825258998444705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document +0.0026904503191419243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document +0.002687906577174073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document +0.002850165346048818 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document +0.005322698571717847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document +0.004450334290869719 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document +0.004700990083440683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document +0.003903568556500995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document +0.00390561515396931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document +0.0039046402900912262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document +0.003907454839379547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document +0.0038583224578603824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document +0.0037914116657695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document +0.003786665266798682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document +0.003792000802430658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document +0.00319266847466091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document +0.0032658716699838944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document +0.0034801959532460023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document +0.0028307012092022594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document +0.0028420360878146276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document +0.0028410455248484914 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document +0.00283497183526842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document +0.002840187195459487 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document +0.0028398709431369834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document +0.004364722843422023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document +0.004093255713117101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document +0.004092331079566252 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document +0.004005326985579649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document +0.0036205502856964207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document +0.003625316793034984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document +0.003604743435602363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document +0.0035405823343673125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document +0.0041601413517253945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document +0.005886303658937057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document +0.003600909532810332 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document +0.0034941365817168658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document +0.0004992164842980224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document +0.00032927705604725614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document +0.0002860154190878753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document +0.0002845217585425619 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document +0.0002743528685497456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document +0.00026025323737738766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document +0.00023493876414603155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document +0.00029665994994226705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document +0.00031808102075993956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document +0.00031813573046011285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document +0.0002711905171855542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document +0.00028892513401817095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document +0.00030003908676979083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document +0.00026839878771944684 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document +0.00029155935002690497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document +0.0002998624927624209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document +0.0003091705447974841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document +0.00026873195794309786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document +0.00027721873498527547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document +0.0002841662554024377 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document +0.0002839461156551537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document +0.0002861705604659811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document +0.0002460995649635886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document +0.00019420142619795496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document +0.00021967677816173628 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document +0.0002620283200480949 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document +0.0002433390542188936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document +0.00021254976608350767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document +0.00022094815569522115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document +0.000342862378668244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document +0.00033784225259118157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document +0.0003367278459543952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document +0.00029843279042852765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document +0.0002926583661257988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document +0.00029320337282010673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document +0.00029281450669483455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document +0.0002915338187002653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document +0.0002864226923084572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document +0.00028643439083586396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document +0.00028253710956299054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document +0.0002810856078805806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document +0.00031474941344656715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document +0.0002139130222205655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document +0.0003084648871862831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document +0.0003309477872140129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document +0.0003360096824695161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document +0.0003355452655196557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document +0.00038119390366386037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document +0.00038078927630086064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document +0.0003386200917551554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document +0.0002158905159938882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document +0.00021621682877018768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document +0.00021553306942740535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document +0.00021581563462722296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document +0.0002157694110556169 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document +0.000215643699847159 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document +0.00021532716715168094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document +0.00021531221326022472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document +0.0002831801179028896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document +0.0002514844936507595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document +0.00031638782778107964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document +0.0002749197545278445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document +0.00026159721512464495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document +0.0002630052420096968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document +0.00031106811228913666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document +0.0002852973415334161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document +3.7555372465932136e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document +0.003548077173506675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document +0.0018372203137874265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document
diff --git a/ALCF/data-lists/polaris/falcon.txt b/ALCF/data-lists/polaris/falcon.txt
new file mode 100644
index 0000000000..914d4803a4
--- /dev/null
+++ b/ALCF/data-lists/polaris/falcon.txt
@@ -0,0 +1,501 @@
+0.0003547982093445404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document falcon +0.00035934014428504944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document falcon +0.00035707704501371544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document falcon +0.00035287930712815354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document falcon +0.00035977166728996823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document falcon +0.0003581675664109838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document falcon +0.0003548617059697185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document falcon +0.0003639582000286208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document falcon +0.00035375839698688127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document falcon +0.0003743722020080678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document falcon +0.0003530399715341242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document falcon +0.00035511875882752406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document falcon +0.0003618733574783154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document falcon +0.00035185243285420104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document falcon +0.0003541503739732106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document falcon +0.0003631679485751914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document falcon +0.00035748045578182274 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document falcon +0.0003606490690555877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document
falcon +0.0003626383296610091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document falcon +0.00035442644361264756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document falcon +0.00035978370170539796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document falcon +0.0003585562375341541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document falcon +0.0003601958372888019 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document falcon +0.000350277765402227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document falcon +0.0003616521184211704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document falcon +0.0003620625543608188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document falcon +0.0003560781983850704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document falcon +0.0003553209610592676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document falcon +0.00035905348643915075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document falcon +0.00034744258805696526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document falcon +0.00035462784035661496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document falcon +0.00034768186175100895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document falcon +0.0003568534635532736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document falcon +0.00035586511544371234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document falcon +0.0003524567827568137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document falcon +0.0003512453770426313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document falcon +0.0003591792726468799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document falcon +0.0003514024529343127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document falcon +0.0003584880112586934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document falcon +0.00035133552916418045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document falcon +0.0003600811981350215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document falcon +0.0003571663974228119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document falcon +0.00035768103378874214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document falcon +0.00035939205561113694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document falcon +0.00035186773916029825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document falcon +0.0003542829672490847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document falcon +0.0003592783642898726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document falcon +0.0003556367340099302 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document falcon +0.00035391392271377027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document falcon +0.00035486725707484836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document falcon +0.00034866743396828035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document falcon +0.0003517219808644735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document 
falcon +0.00034874458549673823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document falcon +0.000355773136961014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document falcon +0.00035611750387841917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document falcon +0.00035305602013916315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document falcon +0.0003578207127071924 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document falcon +0.00035514635841943707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document falcon +0.00034816946212866206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document falcon +0.0003512707269761496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document falcon +0.0003483392117980654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document falcon +0.0003572169607204321 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document falcon +0.00035139153281660794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document falcon +0.00035536422129036537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document falcon +0.000352017164107143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document falcon +0.000351889550179365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document falcon +0.000358759689953589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document falcon +0.0003569286079869268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document falcon +0.0003657752958602099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document falcon +0.00035396127934790697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document falcon +0.0003618565071224743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document falcon +0.00035146051531973204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document falcon +0.00036107135765783567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document falcon +0.00035019554279994576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document falcon +0.00035567858879904983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document falcon +0.0003504753174793183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document falcon +0.00035931140831329194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document falcon +0.0003502967866002823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document falcon +0.0003532911801041972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document falcon +0.0003583543013070199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document falcon +0.0003566243489931224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document falcon +0.0003468752314799221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document falcon +0.0003597840618138091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document falcon +0.00035128822484768084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document falcon +0.00035889496943437507 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document falcon +0.000352400524650424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document falcon 
+0.0003518689536768735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document falcon +0.00035866864741303467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document falcon +0.0003454687659106334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document falcon +0.00035348007259317576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document falcon +0.0003539752270940644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document falcon +0.00035146495994081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document falcon +0.00035397212846310423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document falcon +0.00035208246467162587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document falcon +0.0003490843168676626 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document falcon +0.00035299633658644394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document falcon +0.00034868327466167065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document falcon +0.00035941351365601583 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document falcon +0.0003545343062735255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document falcon +0.0003528956380445978 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document falcon +0.0003553355770443352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document falcon +0.0003644224004937743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document falcon +0.00035234291036216907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document falcon +0.0003596237469847771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document falcon +0.0003531996065735989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document falcon +0.0003547177054106099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document falcon +0.0003575586499260483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document falcon +0.00035262635135283667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document falcon +0.0003624191962188944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document falcon +0.0003488398052948616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document falcon +0.0003598294093147917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document falcon +0.00035583006534466323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document falcon +0.00035403139653225103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document falcon +0.00036134702642187156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document falcon +0.0003573689927162834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document falcon +0.0003577141131435527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document falcon +0.00035208814419277406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document falcon +0.00035996720683665625 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document falcon +0.00035415304658912596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document falcon +0.00036353353029443546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document falcon 
+0.0003537326003150983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document falcon +0.00036053976358299083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document falcon +0.000352380489373494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document falcon +0.00036154661616900994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document falcon +0.00035959332325963614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document falcon +0.0003597954667189692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document falcon +0.0003563108270597542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document falcon +0.0003582891940460143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document falcon +0.0003497728210484297 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document falcon +0.0003549834902179354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document falcon +0.0003529828233484542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document falcon +0.00034627483903285777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document falcon +0.00035569006572589215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document falcon +0.00035449377946910314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document falcon +0.00035802844396194623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document falcon +0.0003617277809353208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document falcon +0.00035034118898654814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document falcon +0.000351091193908611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document falcon +0.0003527914342210668 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document falcon +0.00035028288369781376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document falcon +0.00035775745592780506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document falcon +0.0003449630690661468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document falcon +0.0003583490698830361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document falcon +0.0003476995746684122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document falcon +0.0003535632505019212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document falcon +0.00035640180641147417 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document falcon +0.000361731045691765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document falcon +0.0003534082129597368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document falcon +0.0003550344149828664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document falcon +0.00035363002411364057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document falcon +0.0003537265579677396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document falcon +0.00034950531383577937 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document falcon +0.00035008511827347514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document falcon +0.00035594533400871325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document falcon 
+0.00035266312861335946 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document falcon +0.00035280268794863923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document falcon +0.0003565470391528536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document falcon +0.0003588492322689137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document falcon +0.00035469909697832775 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document falcon +0.00034712082813410526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document falcon +0.000348701157101807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document falcon +0.0003500192014479944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document falcon +0.00035120560544669755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document falcon +0.00035403656850437445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document falcon +0.00035852376560749366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document falcon +0.0003534754068111774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document falcon +0.00035591740046720765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document falcon +0.000348522354782563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document falcon +0.0003533533959664415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document falcon +0.00035631425964030697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document falcon +0.0003485886551574741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document falcon +0.00035917652631065777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document falcon +0.0003482975272111288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document falcon +0.00035580661277480167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document falcon +0.0003492290722955348 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document falcon +0.00034989284450240613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document falcon +0.0003545677216162781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document falcon +0.00034622286859463484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document falcon +0.00036070626989861965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document falcon +0.00035518365036320786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document falcon +0.00035272907057848406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document falcon +0.0003547343638218734 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document falcon +0.0003496450144966242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document falcon +0.0003537407829294287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document falcon +0.0003489722653985685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document falcon +0.00035057186899911295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document falcon +0.0003507566548933051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document falcon +0.00035630360179023747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document falcon 
+0.00035631362503416367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document falcon +0.0003490204248026821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document falcon +0.00035761724058371226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document falcon +0.00035037664777467137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document falcon +0.000353402110481068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document falcon +0.00034524163568371745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document falcon +0.00035528523728570974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document falcon +0.00034784916132431703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document falcon +0.00034928476408048925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document falcon +0.00034989205973784984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document falcon +0.00034201664404094254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document falcon +0.0003529676016338611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document falcon +0.00034643433682346637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document falcon +0.0003511666373001904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document falcon +0.00034828669066575333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document falcon +0.0003494625207264413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document falcon +0.0003458957535879216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document falcon +0.0003543020478990003 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document falcon +0.00034754384069014956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document falcon +0.0003598856392240133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document falcon +0.0003503335458553846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document falcon +0.00035919595619778716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document falcon +0.00035767737970754404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document falcon +0.00035197152783998165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document falcon +0.0003549609834422404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document falcon +0.0003568184100569753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document falcon +0.0003512652818651935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document falcon +0.00035912648958665754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document falcon +0.00034764526964056546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document falcon +0.000352439784960359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document falcon +0.00035295886560764226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document falcon +0.0003518132693658672 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document falcon +0.00035589987915465713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document falcon +0.00034923863317385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document falcon 
+0.0003457987267929692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document falcon +0.0003560928663480501 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document falcon +0.0003529603811204932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document falcon +0.0003524438555443043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document falcon +0.0003438847030263783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document falcon +0.00035981978898461613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document falcon +0.0003446342778566972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document falcon +0.00035529584995236537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document falcon +0.00034855740895831116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document falcon +0.00034932634912802544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document falcon +0.00035805518303064666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document falcon +0.0003497941877073061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document falcon +0.00035774398685405447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document falcon +0.0003560421780316607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document falcon +0.0003508844468369392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document falcon +0.00035731928892270107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document falcon +0.0003557884626314314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document falcon +0.00034992996760289355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document falcon +0.000360752554360921 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document falcon +0.0003452321668708545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document falcon +0.0003591745226131023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document falcon +0.00035256981433229084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document falcon +0.00035378123159712034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document falcon +0.000350464354895999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document falcon +0.00035074625557389677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document falcon +0.00035025894701994667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document falcon +0.00035437902514857614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document falcon +0.0003514684519732232 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document falcon +0.00035449717909633905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document falcon +0.0003436816402714221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document falcon +0.00035139158071782116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document falcon +0.0003509424079843335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document falcon +0.000343894618577506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document falcon +0.0003500789770661659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document falcon 
+0.0003407788080680086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document falcon +0.0003581908175239701 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document falcon +0.0003465541618780918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document falcon +0.00034600228792437736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document falcon +0.00034416738982773204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document falcon +0.0003519900340150641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document falcon +0.000343369616864659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document falcon +0.0003544993883274688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document falcon +0.0003504441365073392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document falcon +0.00034859160702727056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document falcon +0.00035355909532647185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document falcon +0.0003471900922691849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document falcon +0.0003563015508709187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document falcon +0.0003487888744148821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document falcon +0.00034711767548688336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document falcon +0.0003530734609369085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document falcon +0.00035123969242560935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document falcon +0.0003517127620891489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document falcon +0.00035232835416868673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document falcon +0.0003524437481912308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document falcon +0.0003525996167005602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document falcon +0.00035064770545242043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document falcon +0.00035311558274981226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document falcon +0.00034952204800569914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document falcon +0.0003541471367344846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document falcon +0.00035418812454561825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document falcon +0.0003528951372900714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document falcon +0.0003542338042975688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document falcon +0.00034937738939942796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document falcon +0.0003522182190878447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document falcon +0.0003501406466507449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document falcon +0.00034973079877492633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document falcon +0.0003485274567713538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document falcon +0.00034999308679368985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document falcon 
+0.0003570051724707296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document falcon +0.00034567230462019706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document falcon +0.00035529000940160696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document falcon +0.00034956512308671755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document falcon +0.0003496962834028953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document falcon +0.0003468745282493457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document falcon +0.0003502717155809202 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document falcon +0.0003556240880896514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document falcon +0.0003515109488424343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document falcon +0.0003563156688192592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document falcon +0.00035040277363989817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document falcon +0.0003481408593290717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document falcon +0.0003624575124332874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document falcon +0.0003522684124250313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document falcon +0.00035286996027653544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document falcon +0.00034967623997256725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document falcon +0.00035182649587602765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document falcon +0.0003524892557026489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document falcon +0.0003507642477451811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document falcon +0.00036190408389835666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document falcon +0.00035102739424880766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document falcon +0.00035239718753257265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document falcon +0.00035298076121821316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document falcon +0.0003478704389752654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document falcon +0.0003503109191567942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document falcon +0.00035143250975654426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document falcon +0.0003480663923069012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document falcon +0.00035691540219998623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document falcon +0.000348815437166351 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document falcon +0.00035202073257766225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document falcon +0.0003491569096274706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document falcon +0.00035277390475511834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document falcon +0.0003524972090026609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document falcon +0.0003504854249750236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document falcon 
+0.00034740238025423914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document falcon +0.00034968015462277606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document falcon +0.0003493798632762674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document falcon +0.0003488202537862122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document falcon +0.0003525461864643725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document falcon +0.00034903815232825664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document falcon +0.00035536982539258216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document falcon +0.00034858083265155483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document falcon +0.0003505014973608067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document falcon +0.00035327984042622104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document falcon +0.0003503286677453136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document falcon +0.00035835274842442816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document falcon +0.00034970302660275595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document falcon +0.000357929573140149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document falcon +0.0003517238649788585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document falcon +0.00036097027318848475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document falcon +0.0003502734074110026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document falcon +0.00035801510806036273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document falcon +0.0003568006373479869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document falcon +0.00036128108717454636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document falcon +0.0003563436883111686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document falcon +0.00035559725321852463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document falcon +0.00035089656006854944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document falcon +0.000359453964362057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document falcon +0.00035629498059104033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document falcon +0.0003622207707090437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document falcon +0.0003540946784512821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document falcon +0.0003594750565232011 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document falcon +0.0003566007415086991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document falcon +0.0003562142599126134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document falcon +0.0003569948186744601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document falcon +0.00035166554847920186 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document falcon +0.00035047994419295137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document falcon +0.0003561578193739437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document falcon 
+0.00035470866838811544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document falcon +0.00034216920464876335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document falcon +0.0003550021513075795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document falcon +0.0003488045105938729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document falcon +0.0003513340720840151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document falcon +0.0003448558566387584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document falcon +0.0003460966026953241 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document falcon +0.0003488157616036459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document falcon +0.0003446120387842362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document falcon +0.000351528602987427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document falcon +0.00035661118227454713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document falcon +0.0003551342699877457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document falcon +0.0003478953397924445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document falcon +0.00034625782458988215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document falcon +0.0003527515447405871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document falcon +0.00034823744889805696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document falcon +0.00034823314560254406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document falcon +0.00035162668292961944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document falcon +0.0003477307716074623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document falcon +0.0003446457989477787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document falcon +0.00034782916273767795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document falcon +0.0003517249130302248 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document falcon +0.0003449873430908556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document falcon +0.00034841291749669877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document falcon +0.0003466028498941749 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document falcon +0.0003486436831199424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document falcon +0.0003478279234211838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document falcon +0.0003495903653274374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document falcon +0.00034896893881218957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document falcon +0.000348941645312426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document falcon +0.0003474221308416894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document falcon +0.0003462621543839385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document falcon +0.0003669373860863891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document falcon +0.00034691156268163006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document falcon 
+0.0003527774103765281 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document falcon +0.00034684565672734663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document falcon +0.0003454250599604457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document falcon +0.0003541536557159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document falcon +0.000345735737037366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document falcon +0.0003524669816385214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document falcon +0.0003441817133096468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document falcon +0.0003519093265859089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document falcon +0.00035080085480352095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document falcon +0.00035285227929327434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document falcon +0.00034354836346901676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document falcon +0.00034789770937373467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document falcon +0.000343665920520102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document falcon +0.0003490884931060568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document falcon +0.00034380029463398654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document falcon +0.00034874768005099945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document falcon +0.0003457058510967673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document falcon +0.00034644265227023904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document falcon +0.00035008339858594957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document falcon +0.0003462377193296194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document falcon +0.0003620491787114201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document falcon +0.000348717011044469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document falcon +0.00034370072363913706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document falcon +0.0003551981066775649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document falcon +0.0003500119496799342 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document falcon +0.0003485082952669081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document falcon +0.0003508155580978919 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document falcon +0.00035311375163251416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document falcon +0.00034945972003423253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document falcon +0.0003474220353789879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document falcon +0.0003536443686585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document falcon +0.0003560350489042953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document falcon +0.0003493655927914396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document falcon +0.0003528423977146383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document falcon 
+0.00035255554724471217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document falcon +0.0003479760010190111 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document falcon +0.00035458598862501956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document falcon +0.0003458990560538315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document falcon +0.00035157946422379875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document falcon +0.00034736860650169996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document falcon +0.0003529152313394119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document falcon +0.00034586294329524465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document falcon +0.00035707214923794877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document falcon +0.0003509580363496512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document falcon +0.00035244176725524474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document falcon +0.0003467539557999047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document falcon +0.00034919687962275546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document falcon +0.00035094031731719953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document falcon +0.0003484309008351352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document falcon +0.0003485409424916253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document falcon +0.0003499590776117838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document falcon +0.0003492842758957848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document falcon +0.0003529712275178912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document falcon +0.0003566141287087449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document falcon +0.0003649496522047409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document falcon +0.0003563218912208234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document falcon +0.00035614782126966145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document falcon +0.0003531944298453266 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document falcon +0.0003535950949566616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document falcon +0.0003544295554928795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document falcon +0.0003519908503740376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document falcon +0.00035752817626134463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document falcon +0.0003515322689589972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document falcon +0.0003486893890307115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document falcon +0.0003446520464889867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document falcon +0.0003509421562481707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document falcon +0.00035335015702909084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document falcon +0.0003490178167345008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document falcon 
+0.0003520497821155174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document falcon +0.0003549762618908944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document falcon +0.00035072190850833103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document falcon +0.0003542458638526423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document falcon +0.000352419194572916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document falcon +0.0003545102564672614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document falcon +0.0003495437992331806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document falcon +0.0003542843376993964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document falcon +0.000352827529313958 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document falcon +0.00035442506093223886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document falcon +0.0003496970719044257 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document falcon +0.0003553096424442362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document falcon +0.00034986845565067564 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document falcon +0.000352131055186658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document falcon +0.0003527021708198983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document falcon +0.00034905885414547214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document falcon +0.0003583433842468394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document falcon +0.00034409435202828383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document falcon +0.00034846410520871483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document falcon +0.0003554459991927314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document falcon +0.00035310507471843076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document falcon +0.000350028910786098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document falcon +0.00035049727458009896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document falcon +0.0003519047735925826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document falcon +0.0003513027429919726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document falcon +0.0003626947260354396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document falcon +0.0003500087324849783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document falcon +0.0003618315726725285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document falcon +0.0003535385113938023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document falcon +0.0003487064058517615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document falcon +0.0003618709124780938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document falcon +0.00035040070335625915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document falcon +0.0003506279032267829 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document falcon +0.0003498435310527524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document falcon 
+0.0003554634749821431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document falcon +0.00035091209738758963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document falcon +0.00035034103678978573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document falcon +0.00035398931854386146 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document falcon +0.00035495529304989485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document falcon +0.00036067883473356603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document falcon + diff --git a/ALCF/data-lists/polaris/megawiki.txt b/ALCF/data-lists/polaris/megawiki.txt new file mode 100644 index 0000000000..56ec7debc7 --- /dev/null +++ b/ALCF/data-lists/polaris/megawiki.txt @@ -0,0 +1,262 @@ +6.322825248625475e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document megawika +2.4432314037946264e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document megawika +5.6313888721313454e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document megawika +2.4208171781595055e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document megawika +2.325811856369237e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document megawika +2.4010790356322705e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document megawika +5.36773610843632e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document megawika +1.360574433501002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document megawika +1.3076540344853244e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document megawika +1.3386534334886313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document megawika +1.2498103719605153e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document megawika +1.403763836949682e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document megawika +1.3636756723495417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document megawika +1.2242489446940814e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document megawika +1.2398255818973339e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document megawika +1.2972616994216281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document megawika +1.3947809855914134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document megawika +1.3144843787829514e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document megawika +1.1693809976572487e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document megawika +1.3677252682893802e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document megawika +1.3940876719849597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document megawika +1.4222245138730965e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document megawika +1.3201677767919704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document megawika +1.1421717796486169e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document megawika +1.2890514724498703e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document 
megawika +1.3649507648749037e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document megawika +1.2400732563490717e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document megawika +1.1557681453277616e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document megawika +1.2294483595964517e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document megawika +1.2137484472122283e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document megawika +1.3299663426456e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document megawika +1.2461984216479532e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document megawika +1.4666434217609636e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document megawika +1.1876997894686238e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document megawika +1.2939155338964078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document megawika +1.3859590039728515e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document megawika +1.317917848615668e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document megawika +1.1335281536110342e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document megawika +1.2889923952861426e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document megawika +1.3471671647053326e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document megawika +1.2221720014475102e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document megawika +1.2632647276287541e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document megawika +1.28276219004076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document megawika +1.36213704321643e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document megawika +1.2414858625261553e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document megawika +1.3173700421883744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document megawika +1.295597796725686e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document megawika +1.242783936442904e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document megawika +1.2417374088427464e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document megawika +1.2134479405400744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document megawika +1.3090040663304255e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document megawika +1.2713470581614905e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document megawika +5.5750231378906594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document megawika +5.777597358425469e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document megawika +5.349786767471258e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document megawika +5.675165050453583e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document megawika +5.482611216158831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document megawika +5.065421899890121e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document megawika +5.384718357480146e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document megawika +4.872037363236061e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document megawika +4.532709250783155e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document megawika +5.7257963030489613e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document megawika +4.9014365579652036e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document megawika +5.722863552770969e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document megawika +6.149911636146833e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document megawika +5.2178057608273506e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document megawika +4.990228161160431e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document megawika +5.866186875255134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document megawika +5.004185734360719e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document megawika +4.79401853705107e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document megawika +5.435219965052376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document megawika +5.035997225792266e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document megawika +5.622401774211625e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document megawika +5.028826157387559e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document megawika +5.596379470128795e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document megawika +6.027824493191489e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document megawika +5.5358270009931474e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document megawika +5.9839051807685496e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document megawika +5.1221077499249595e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document megawika +5.517228560620279e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document megawika +5.1687858285052305e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document megawika +5.684188244145645e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document megawika +5.212693275535878e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document megawika +4.8551007022784084e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document megawika +5.4888506639203145e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document megawika +5.345098688527242e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document megawika +4.8506420625516594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document megawika +5.132168603397676e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document megawika +5.719476795114223e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document megawika +5.7448621149792696e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document megawika 
+4.9068410568059265e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document megawika +5.382937299647678e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document megawika +4.8288432136304634e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document megawika +5.841703200305416e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document megawika +5.1589611587885584e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document megawika +6.031113829732574e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document megawika +5.4558202844532094e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document megawika +5.341852317196142e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document megawika +5.1402942738369954e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document megawika +5.735421384377395e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document megawika +5.473629863586958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document megawika +5.4708993245733936e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document megawika +4.931161863634078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document megawika +5.104173022127248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document megawika +5.510157161510824e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document megawika +5.652501401782597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document megawika +5.7273656573031666e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document megawika +5.638363224821738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document megawika +5.6128115396668704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document megawika +5.00304877998141e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document megawika +5.596120554779096e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document megawika +5.5280923889040006e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document megawika +5.223477917938408e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document megawika +5.29472809986569e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document megawika +2.205682378243213e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document megawika +1.4367563720603185e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document megawika +3.5506193487931076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document megawika +3.0442910855821778e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document megawika +2.2540042508019627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document megawika +2.6880163202623216e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document megawika +2.534473148048727e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document megawika +2.6560945431318916e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document megawika +2.547470248967691e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document megawika +2.5248825388073738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document megawika +2.5828729575000054e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document megawika +2.4026583817957736e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document megawika +2.3930425429834413e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document megawika +2.5037365362599724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document megawika +2.6696745470595603e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document megawika +2.140323051341762e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document megawika +2.617354786691592e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document megawika +1.538359101762691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document megawika +1.2871029252377856e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document megawika +2.255195411289217e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document megawika +2.4832313897952067e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document megawika +9.303873918189968e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document megawika +2.179532302620228e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document megawika +1.9750517506901206e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document megawika +2.7740420380648435e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document megawika +2.7813714782319335e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document megawika +4.1595357937609806e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document megawika +2.741365122389175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document megawika +2.117451071361901e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document megawika +1.7132649760565998e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document megawika +1.7492547092602047e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document megawika +1.7499951097392276e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document megawika +1.6632444789170958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document megawika +1.6678802252361607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document megawika +1.5519208704558896e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document megawika +1.652420992967167e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document megawika +1.6119931034508755e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document megawika +1.6638882076736552e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document megawika +1.7198076782652946e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document megawika +1.572927860565175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document megawika +1.5194822618169918e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document 
megawika +1.6677776832669846e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document megawika +1.595612492245688e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document megawika +1.682350633181197e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document megawika +1.663983380609724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document megawika +1.710187842689243e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document megawika +1.5733697527539038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document megawika +1.6972104757911438e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document megawika +1.6610142847616577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document megawika +1.61094882403031e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document megawika +1.4789207305138325e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document megawika +1.639299617676302e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document megawika +1.3241204512116132e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document megawika +8.582260726625535e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document megawika +8.213000975576739e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document megawika +9.549247732811947e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document megawika +9.17242785339013e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document megawika +7.632868223725218e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document megawika +8.674401118222175e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document megawika +9.124384255505347e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document megawika +8.344222222417358e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document megawika +8.992299957499065e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document megawika +8.76689497361025e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document megawika +7.973396239586015e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document megawika +9.006935606644125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document megawika +8.725545954955498e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document megawika +1.215449694669174e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document megawika +3.3041720284158646e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document megawika +2.0593512412624502e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document megawika +1.893608946986248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document megawika +1.737111666788535e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document megawika +1.4915923449873955e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document megawika +2.289370239067605e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document megawika +2.8615335689614638e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document megawika +8.847283630883125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document megawika +1.8175470362373804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document megawika +1.8152226683368038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document megawika +1.789149655314284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document megawika +1.7690523036477663e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document megawika +1.8333732213753644e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document megawika +1.8794105687718654e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document megawika +1.721841156706417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document megawika +2.0612008685724796e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document megawika +1.9297370681336376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document megawika +2.0188440409661018e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document megawika +5.1741216329695265e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document megawika +1.3417913926038429e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document megawika +1.1010813016469651e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document megawika +1.1252416134320087e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document megawika +1.2801744104313002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document megawika +1.3041514955795817e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document megawika +1.3428837580879075e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document megawika +1.320809382267804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document megawika +1.3451566676555968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document megawika +1.228284926657501e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document megawika +1.2410599573923043e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document megawika +1.3815343367377182e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document megawika +1.3895126265148832e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document megawika +1.2306773644401741e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document megawika +1.32981021906281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document megawika +1.101337469221607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document megawika +1.513094184404692e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document megawika +1.1073759547073234e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document megawika +1.2879348765857567e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document megawika +9.619595770228435e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document megawika +1.2384340836286436e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document 
megawika +1.1766667232211577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document megawika +1.2871049236196452e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document megawika +1.2010645926497744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document megawika +1.3971428231518597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document megawika +1.2283733550547932e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document megawika +1.2659530508255308e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document megawika +1.551775613074462e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document megawika +1.1169413343776979e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document megawika +1.1433700593712463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document megawika +4.964773647323492e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document megawika +1.0995586595687313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document megawika +1.2957393071411267e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document megawika +2.75899247407709e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document megawika +2.8269344597344854e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document megawika +2.329108187246831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document megawika +2.4231761430460284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document megawika +1.2434140512230442e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document megawika +1.638718338352859e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document megawika +3.272953556801187e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document megawika +6.061314500486327e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document megawika +1.2465979731210292e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document megawika +1.2737557327967737e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document megawika +1.038428658075627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document megawika +2.61666472045566e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document megawika +3.6506873212272224e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document megawika +1.5066359138295701e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document megawika +1.1166290872121178e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document megawika +1.5546966228590285e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document megawika +1.2583434625014828e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document megawika +1.3398826881300862e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document megawika +1.2944933160515968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document megawika +1.0971437399901365e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document megawika +1.2787922795775774e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document megawika +1.404979227816985e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document megawika +1.3344734431324463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document megawika +4.886031157107555e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document megawika +3.277261443596394e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document megawika +3.5057957685786495e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document megawika +3.287625301718589e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document megawika +3.1370056372668855e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document megawika +3.186092015785841e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document megawika +7.271819324142512e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document megawika diff --git a/ALCF/data-lists/polaris/open-web-math-train.txt b/ALCF/data-lists/polaris/open-web-math-train.txt new file mode 100644 index 0000000000..6d86bd35a4 --- /dev/null +++ b/ALCF/data-lists/polaris/open-web-math-train.txt @@ -0,0 +1,13 @@ +0.001451215788905126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document open-web-math-train +0.0014486847196258788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document open-web-math-train +0.0008861032722895899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document open-web-math-train +0.0018119590809459816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document open-web-math-train +0.0008916937917547129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document open-web-math-train +6.960128832809415e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document open-web-math-train +0.002008403651063623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document open-web-math-train +0.0014374900742131454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document open-web-math-train +0.00180213596996716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document open-web-math-train +0.001956178877532413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document open-web-math-train +0.0008829547017667033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document open-web-math-train +0.0008910853619157279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document open-web-math-train +0.0018260998845299973 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document open-web-math-train diff --git a/ALCF/data-lists/polaris/pes2o.txt b/ALCF/data-lists/polaris/pes2o.txt new file mode 100644 index 0000000000..47a7eb3ffd --- /dev/null +++ b/ALCF/data-lists/polaris/pes2o.txt @@ -0,0 +1,26 @@ +0.0012499632072059553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document pes2o +0.00125398260359913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document pes2o +0.0012541704774729071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document pes2o +0.0012527268234360602 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document pes2o +0.0012532925243737164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document pes2o +0.0012456396241204315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document pes2o +0.0012589894424352072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document pes2o +0.001508020123999618 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document pes2o +0.00333096950781965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document pes2o +0.0033233414614415547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document pes2o +0.003512387990689828 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document pes2o +0.0035091382940513126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document pes2o +0.003514155927147005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document pes2o +0.003327108000579638 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document pes2o +0.003329106196589836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document pes2o +0.003505604148738077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document pes2o +0.003324825759567855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document pes2o +0.0033248240149804913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document pes2o +0.0033385962112851358 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document pes2o +0.0035043186296553615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document pes2o +0.003340469505431529 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document pes2o +0.0035106889084796276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document pes2o +0.0033309469281030167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document pes2o +0.003340337858029757 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document pes2o +0.003505919861097801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document pes2o +0.0003882924098240512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document pes2o diff --git a/ALCF/data-lists/polaris/reddit.txt b/ALCF/data-lists/polaris/reddit.txt new file mode 100644 index 0000000000..ef79bbc7c8 --- /dev/null +++ b/ALCF/data-lists/polaris/reddit.txt @@ -0,0 +1,78 @@ +0.0005759963691850877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document reddit +0.0005959971675332674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document reddit +0.0006026179290353799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document reddit +0.0005824184320784846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document reddit +0.0005854598548616037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document reddit +0.0005903767055633473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document reddit +0.0005930306490982049 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document reddit +0.000569425602700746 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document reddit +0.0005675060415179408 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document reddit +0.0005772431621253389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document 
reddit +0.0005678026053826858 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document reddit +0.0005700398263483378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document reddit +0.0005669467963528824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document reddit +0.0005701015953324305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document reddit +0.0005795907287413296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document reddit +0.0005735602737531164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document reddit +0.0005749862745842101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document reddit +0.0005693257015931971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document reddit +0.0005716568794795563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document reddit +0.0005761083919774021 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document reddit +0.0005688343169797355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document reddit +0.0005807913190929842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document reddit +0.0005710229258078636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document reddit +0.0005704083039826862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document reddit +0.0005862132348308056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document reddit +0.0005717662049559556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document reddit +0.0005858155213694451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document reddit +0.0005812012281792392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document reddit +0.0005803981414588498 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document reddit +0.0005700102108287723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document reddit +0.0005719243459052329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document reddit +0.0005867253401661752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document reddit +0.0005731087218860733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document reddit +0.0005712197789109317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document reddit +0.0005702376926310089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document reddit +0.0005700411527742972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document reddit +0.0005828090098178196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document reddit +0.0005770140826168056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document reddit +0.0005723509664597896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document reddit +0.0005755499231836962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document reddit +0.0005636407438471367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document reddit +0.0005640281556500104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document reddit +0.0005633159058766496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document reddit +0.0005638034311151449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document reddit 
+0.0005630066273073224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document reddit +0.0005631803831128559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document reddit +0.0005631228881679657 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document reddit +0.0005628178701487633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document reddit +0.0005624448092256196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document reddit +0.0005620957024062329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document reddit +0.0005614201504177484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document reddit +0.0005616890951464056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document reddit +0.0005611348559279058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document reddit +0.0005604238061828518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document reddit +0.0005603301490194237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document reddit +0.0005607291294548833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document reddit +0.0005605234569930727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document reddit +0.0005613778566640694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document reddit +0.0005610248539992471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document reddit +0.0005599977416780475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document reddit +0.0005603632562116935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document reddit +0.0005599177479509897 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document reddit +0.0005595202318298379 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document reddit +0.0005600975633499175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document reddit +0.0005614075491213365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document reddit +0.000612563885043477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document reddit +0.0005515469909644413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document reddit +0.0005526782014946906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document reddit +0.0005472463408095445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document reddit +0.0005502284746004587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document reddit +0.0005414514790555363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document reddit +0.0005513499500134784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document reddit +0.0005391391454105187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document reddit +0.0005415836910001838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document reddit +0.0005208132468536551 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document reddit +0.0005889827143132871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document reddit +0.0005822520817765276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document reddit +0.0004173155230758696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document reddit diff --git 
a/ALCF/data-lists/polaris/stack.txt b/ALCF/data-lists/polaris/stack.txt new file mode 100644 index 0000000000..a81e55f94a --- /dev/null +++ b/ALCF/data-lists/polaris/stack.txt @@ -0,0 +1,26 @@ +0.0009994361338078242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document stackexchange +0.001087156194657966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document stackexchange +0.0010667737163656816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document stackexchange +0.0009602877882124873 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document stackexchange +0.0008968956271971105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document stackexchange +0.0009198034843762967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document stackexchange +0.0009423901016715341 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document stackexchange +0.0009674094553686345 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document stackexchange +0.0009858331322519164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document stackexchange +0.0009970593645879198 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document stackexchange +0.0010027035193731686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document stackexchange +0.0010128291154221853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document stackexchange +0.0010215631382631918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document stackexchange +0.0010288663771461238 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document stackexchange +0.0010346219929285867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document stackexchange +0.00104544019940344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document stackexchange +0.0010525172676724333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document stackexchange +0.0010609529620775127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document stackexchange +0.0010725892748610153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document stackexchange +0.0010818563598181568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document stackexchange +0.0010992760196793917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document stackexchange +0.0011178992762079917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document stackexchange +0.001124687532085676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document stackexchange +0.001118303661267191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document stackexchange +0.0010206825575416534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document stackexchange +0.0005512280117499715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document stackexchange diff --git a/ALCF/data-lists/polaris/starcoder.txt b/ALCF/data-lists/polaris/starcoder.txt new file mode 100644 index 0000000000..5c28dd55b6 --- /dev/null +++ b/ALCF/data-lists/polaris/starcoder.txt @@ -0,0 +1,50 @@ +0.004474659408857016 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document starcoder +0.00409944473890653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document starcoder +0.005137179939941845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document starcoder +0.005143172251066109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document starcoder +0.005206134363352808 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document starcoder +0.004892747858974329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document starcoder +0.004844731352552902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document starcoder +0.005308320169123755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document starcoder +0.005124709815666577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document starcoder +0.005424710744483826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document starcoder +0.00538244648861977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document starcoder +0.0029107284679086853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document starcoder +0.0026825258998444705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document starcoder +0.0026904503191419243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document starcoder +0.002687906577174073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document starcoder +0.002850165346048818 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document starcoder +0.005322698571717847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document starcoder +0.004450334290869719 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document starcoder +0.004700990083440683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document starcoder +0.003903568556500995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document starcoder +0.00390561515396931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document starcoder +0.0039046402900912262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document starcoder +0.003907454839379547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document starcoder +0.0038583224578603824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document starcoder +0.0037914116657695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document starcoder +0.003786665266798682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document starcoder +0.003792000802430658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document starcoder +0.00319266847466091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document starcoder +0.0032658716699838944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document starcoder +0.0034801959532460023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document starcoder +0.0028307012092022594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document starcoder +0.0028420360878146276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document starcoder +0.0028410455248484914 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document starcoder +0.00283497183526842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document starcoder +0.002840187195459487 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document starcoder +0.0028398709431369834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document starcoder +0.004364722843422023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document starcoder +0.004093255713117101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document starcoder +0.004092331079566252 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document starcoder +0.004005326985579649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document starcoder +0.0036205502856964207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document starcoder +0.003625316793034984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document starcoder +0.003604743435602363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document starcoder +0.0035405823343673125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document starcoder +0.0041601413517253945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document starcoder +0.005886303658937057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document starcoder +0.003600909532810332 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document starcoder +0.0034941365817168658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document starcoder +0.0004992164842980224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document starcoder + diff --git a/ALCF/data-lists/polaris/tulu.txt b/ALCF/data-lists/polaris/tulu.txt new file mode 100644 index 0000000000..e7a681d660 --- /dev/null +++ b/ALCF/data-lists/polaris/tulu.txt @@ -0,0 +1,66 @@ +0.00032927705604725614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document tulu +0.0002860154190878753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document tulu +0.0002845217585425619 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document tulu +0.0002743528685497456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document tulu +0.00026025323737738766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document tulu +0.00023493876414603155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document tulu +0.00029665994994226705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document tulu +0.00031808102075993956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document tulu +0.00031813573046011285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document tulu +0.0002711905171855542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document tulu +0.00028892513401817095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document tulu +0.00030003908676979083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document tulu +0.00026839878771944684 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document tulu +0.00029155935002690497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document tulu +0.0002998624927624209 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document tulu +0.0003091705447974841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document tulu +0.00026873195794309786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document tulu +0.00027721873498527547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document tulu +0.0002841662554024377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document tulu +0.0002839461156551537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document tulu +0.0002861705604659811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document tulu +0.0002460995649635886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document tulu +0.00019420142619795496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document tulu +0.00021967677816173628 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document tulu +0.0002620283200480949 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document tulu +0.0002433390542188936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document tulu +0.00021254976608350767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document tulu +0.00022094815569522115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document tulu +0.000342862378668244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document tulu +0.00033784225259118157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document tulu +0.0003367278459543952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document tulu +0.00029843279042852765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document tulu +0.0002926583661257988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document tulu +0.00029320337282010673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document tulu +0.00029281450669483455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document tulu +0.0002915338187002653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document tulu +0.0002864226923084572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document tulu +0.00028643439083586396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document tulu +0.00028253710956299054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document tulu +0.0002810856078805806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document tulu +0.00031474941344656715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document tulu +0.0002139130222205655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document tulu +0.0003084648871862831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document tulu +0.0003309477872140129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document tulu +0.0003360096824695161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document tulu +0.0003355452655196557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document tulu +0.00038119390366386037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document tulu +0.00038078927630086064 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document tulu +0.0003386200917551554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document tulu +0.0002158905159938882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document tulu +0.00021621682877018768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document tulu +0.00021553306942740535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document tulu +0.00021581563462722296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document tulu +0.0002157694110556169 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document tulu +0.000215643699847159 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document tulu +0.00021532716715168094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document tulu +0.00021531221326022472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document tulu +0.0002831801179028896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document tulu +0.0002514844936507595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document tulu +0.00031638782778107964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document tulu +0.0002749197545278445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document tulu +0.00026159721512464495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document tulu +0.0002630052420096968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document tulu +0.00031106811228913666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document tulu +0.0002852973415334161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document tulu +3.7555372465932136e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document tulu diff --git a/ALCF/data-lists/polaris/wiki.txt b/ALCF/data-lists/polaris/wiki.txt new file mode 100644 index 0000000000..55ba7680ad --- /dev/null +++ b/ALCF/data-lists/polaris/wiki.txt @@ -0,0 +1,2 @@ +0.003548077173506675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document wiki +0.0018372203137874265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document wiki diff --git a/ALCF/data-lists/sirius/books.txt b/ALCF/data-lists/sirius/books.txt new file mode 100644 index 0000000000..7567ba5227 --- /dev/null +++ b/ALCF/data-lists/sirius/books.txt @@ -0,0 +1,3 @@ +0.006 /lus/tegu/projects/PolarisAT/foremans/projects/argonne-lcf/Megatron-DeepSpeed/data/books-0000_text_document +0.006 /lus/tegu/projects/PolarisAT/foremans/projects/argonne-lcf/Megatron-DeepSpeed/data/books-0001_text_document +0.006 /lus/tegu/projects/PolarisAT/foremans/projects/argonne-lcf/Megatron-DeepSpeed/data/books-0002_text_document diff --git a/ALCF/data-lists/sunspot/algebraic.txt b/ALCF/data-lists/sunspot/algebraic.txt new file mode 100644 index 0000000000..f72bf47d74 --- /dev/null +++ b/ALCF/data-lists/sunspot/algebraic.txt @@ -0,0 +1,16 @@ +0.0018520780893211373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document algebraic-stack-train +0.0017591050606817512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document algebraic-stack-train +0.001459052794333798 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document algebraic-stack-train +0.0007405667281569194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document algebraic-stack-train +0.00019420030110896795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document algebraic-stack-train +0.0009008668715801845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document algebraic-stack-train +0.00015115827957143057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document algebraic-stack-train +0.0014552844319220648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document algebraic-stack-train +0.0012469861325685161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document algebraic-stack-train +0.00136412011372413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document algebraic-stack-train +0.0007064279699221103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document algebraic-stack-train +0.0008472240000687427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document algebraic-stack-train +0.0001984375713341955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document algebraic-stack-train +0.0005472773881697123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document algebraic-stack-train +0.001815779629850992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document algebraic-stack-train +0.0018313600689757324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document algebraic-stack-train diff --git a/ALCF/data-lists/sunspot/arxiv.txt b/ALCF/data-lists/sunspot/arxiv.txt new file mode 100644 index 0000000000..34972accf4 --- /dev/null +++ b/ALCF/data-lists/sunspot/arxiv.txt @@ -0,0 +1,100 @@ +0.0002583902668716813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document arxiv +0.0002646575141232155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document arxiv +0.0003165521247456758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document arxiv +0.0002920706460176214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document arxiv +0.00028396813182810215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document arxiv +0.00030445161883108107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document arxiv +0.00031628781276576474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document arxiv +0.0003083776568189157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document arxiv +0.0003176359471472902 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document arxiv +0.0002536009369131698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document arxiv +0.0003067491424681363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document arxiv +0.0002597217257557784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document arxiv +0.0003788556450109768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document arxiv +0.0002796563272052598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document arxiv +0.00033573826524290287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document arxiv +0.00030523658022800287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document arxiv +0.00032211552192240096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document arxiv +0.0003329295675164247 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document arxiv +0.0003101982186639862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document arxiv +0.00032361798234223355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document arxiv +0.0003495541581652915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document arxiv +0.0002821637448858042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document arxiv +0.00030399523537629673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document arxiv +0.0002955658968247219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document arxiv +0.00028942158502924254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document arxiv +0.00028769546171490733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document arxiv +0.0002938111057234182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document arxiv +0.0002711150403010948 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document arxiv +0.00031130095874747565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document arxiv +0.0003002996118160777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document arxiv +0.0003732757901604459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document arxiv +0.00026784205751795894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document arxiv +0.0002799626521661984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document arxiv +0.00034334276069078164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document arxiv +0.0003582469803674965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document arxiv +0.00031094844818418623 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document arxiv +0.0002766228384977191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document arxiv +0.00030297116159471485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document arxiv +0.00027033888377464685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document arxiv +0.00030090862368377933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document arxiv +0.00028543875802490955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document arxiv +0.00027559768459074204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document arxiv +0.0003182185533962886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document arxiv +0.0003311392971435837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document arxiv +0.00028751652060804325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document arxiv +0.000303466863212589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document arxiv +0.00033400462801277524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document arxiv +0.0002589234031777426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document arxiv +0.0002913508598466723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document arxiv +0.0002670572450004856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document arxiv +0.00032027399105647656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document arxiv +0.00032188376258379377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document arxiv +0.0003161585784100882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document arxiv +0.0003184249182974135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document arxiv +0.00030381336664000807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document arxiv +0.0003190437442184283 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document arxiv +0.0002537961798200545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document arxiv +0.0003017817117223326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document arxiv +0.00028685268513240224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document arxiv +0.00031265179094451165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document arxiv +0.00034708319096986816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document arxiv +0.00026650837943080664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document arxiv +0.00034588832248507335 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document arxiv +0.0002416982248399037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document arxiv +0.0003089296918222243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document arxiv +0.00029137184185700827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document arxiv +0.00026464226846800774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document arxiv +0.00030545397919456627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document arxiv +0.0003206778460448875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document arxiv +0.00030968971641110967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document arxiv +0.00023325653928600864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document arxiv +0.00030526899198338555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document arxiv +0.00035376719076633584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document arxiv +0.000290224385981026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document arxiv +0.000294650083382008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document arxiv +0.00028768858128616436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document arxiv +0.00030856965235527843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document arxiv +0.00030579942447879054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document arxiv +0.0002863101084704357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document arxiv +0.0002870032092492213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document arxiv +0.000264182727569885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document arxiv +0.0002974012367036449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document arxiv +0.00032238412143059203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document arxiv +0.00031683716893819036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document arxiv +0.00031157434937617524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document arxiv +0.0003411742735695989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document arxiv +0.00026778444816570715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document arxiv +0.0003037045797275201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document arxiv +0.00027746114370081314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document arxiv +0.00027148285946862043 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document arxiv +0.00028042950114678207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document arxiv +0.0003235607816590721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document arxiv +0.0003086692227306295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document arxiv +0.00033990349455148105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document arxiv +0.00030945053208470265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document arxiv +0.00027309074552265303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document arxiv +0.00028737393506316194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document arxiv +0.0003098868328009879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document arxiv +0.0002614229162588409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document arxiv +0.0002884388407820923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document arxiv diff --git a/ALCF/data-lists/sunspot/books.txt b/ALCF/data-lists/sunspot/books.txt new file mode 100644 index 0000000000..9502fba1f5 --- /dev/null +++ b/ALCF/data-lists/sunspot/books.txt @@ -0,0 +1,3 @@ +0.0031025147279277244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document books +0.003102019887362634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document books +0.0009996745994661548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document books diff --git a/ALCF/data-lists/sunspot/c4.txt b/ALCF/data-lists/sunspot/c4.txt new file mode 100644 index 0000000000..ca4836ad81 --- /dev/null +++ b/ALCF/data-lists/sunspot/c4.txt @@ -0,0 +1,171 @@ +0.0002406272620255565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document c4 +0.0002404825539493424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document c4 +0.00024062296575435581 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document c4 +0.00024069315766818953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document c4 +0.00024055829162263452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document c4 +0.00024062053397343032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document c4 +0.0002410715545206964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document c4 +0.00024024881846087368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document c4 +0.0002407074700790688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document c4 +0.00024072141428809043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document c4 +0.00024027710230872736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document c4 
+0.0002409111299205489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document c4 +0.00024081954058275009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document c4 +0.00024086076794990912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document c4 +0.00024098672620832446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document c4 +0.00024068622303333862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document c4 +0.00024140627024291824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document c4 +0.0002414512033594384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document c4 +0.00024028742594941463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document c4 +0.00024018036089269645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document c4 +0.0002398347365034979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document c4 +0.00024006780153485276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document c4 +0.00024015620270419213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document c4 +0.0002408848259695227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document c4 +0.0002408023185278831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document c4 +0.00024021196580140326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document c4 +0.00024077677271297493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document c4 +0.00024087392454668027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document c4 +0.0002408071293824126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document c4 +0.00024042223828845715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document c4 +0.0002411484752360495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document c4 +0.00023605263746465907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document c4 +0.00023471222158326908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document c4 +0.00023432138580287644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document c4 +0.00023407385623382327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document c4 +0.00023487504174367091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document c4 +0.0002341843704976313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document c4 +0.00023421993170282486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document c4 +0.00023445057969132037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document c4 +0.0002337681680073047 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document c4 +0.000234627964808109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document c4 +0.0002338942211888584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document c4 +0.00023403849286843386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document c4 +0.00023405641310796305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document c4 +0.00023349169562397965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document c4 +0.00023381157386048856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document c4 +0.00023388742993790587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document c4 +0.00023363103829469813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document c4 +0.00023421141834630477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document c4 +0.00023420564352232565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document c4 +0.00023367463699173143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document c4 +0.00023344969163567033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document c4 +0.00023372196941547188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document c4 +0.00023399207645297834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document c4 +0.00023357915605505856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document c4 +0.00023337585642190864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document c4 +0.00023385005470157914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document c4 +0.00023301533534493465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document c4 +0.00023377864302541782 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document c4 +0.00023323745848621437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document c4 +0.0002330594611151835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document c4 +0.0002334149675026783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document c4 +0.00023198945902291534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document c4 +0.00023023784834634142 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document c4 +0.00022985623060187217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document c4 +0.0002292605284569516 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document c4 +0.00022926593333048894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document c4 +0.00022922766406807777 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document c4 +0.00022898153911167426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document c4 +0.0002292473111593315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document c4 +0.000228804579400424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document c4 +0.00022865485613513526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document c4 +0.00022937426835887895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document c4 +0.00022917388311587372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document c4 +0.0002291660582019043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document c4 +0.00022907895248360543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document c4 +0.0002294617879920205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document c4 +0.0002290452150516566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document c4 +0.00022943405619715553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document c4 +0.0002296271421006204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document c4 +0.00022854791372910372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document c4 +0.00022923123467686557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document c4 +0.00022852404355738494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document c4 +0.00022847798660086642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document c4 +0.0002289604586810316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document c4 +0.00022835479834950643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document c4 +0.0002289149402884243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document c4 +0.00022806655474763446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document c4 +0.00022826296420992974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document c4 +0.00022906829636213627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document c4 +0.0002287628414466998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document c4 +0.0002282673911253445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document c4 +0.00022869309841939134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document c4 +0.0002281540116815451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document c4 +0.0002259755756162738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document c4 +0.00022562331285233504 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document c4 +0.0002259061146106053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document c4 +0.00022567670836663787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document c4 +0.00022573165387587061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document c4 +0.00022508514961670572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document c4 +0.00022564642513773356 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document c4 +0.00022563088621998788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document c4 +0.0002250438755373707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document c4 +0.00022524465346241134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document c4 +0.00022531737657666812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document c4 +0.00022444687519363458 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document c4 +0.00022460397498596298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document c4 +0.00022454218976501763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document c4 +0.00022447528843671366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document c4 +0.00022501666332178926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document c4 +0.00022453752304377972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document c4 +0.00022484451871163002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document c4 +0.00022465678847154914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document c4 +0.00022453180917044732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document c4 +0.0002247278486823009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document c4 +0.00022465794828242097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document c4 +0.00022431000701925386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document c4 +0.00022476020248460963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document c4 +0.00022467531771795015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document c4 +0.0002236391309945234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document c4 +0.00022458764920536007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document c4 +0.00022430877426744415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document c4 +0.0002247047786127192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document c4 +0.0002245298090400035 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document c4 +0.0002245648831396188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document c4 +0.00022292894729820784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document c4 +0.00022236668082957533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document c4 +0.0002217622659895442 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document c4 +0.00022252452726732609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document c4 +0.00022135333211363678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document c4 +0.0002214571757787971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document c4 +0.0002217188139237798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document c4 +0.00022144214894640303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document c4 +0.00022100172806631854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document c4 +0.00022156392409199052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document c4 +0.00022134830143710272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document c4 +0.00022158598922529453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document c4 +0.00022142932483041377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document c4 +0.00022120980907786554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document c4 +0.00022117917738112441 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document c4 +0.00022077089397851235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document c4 +0.00022093265074996711 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document c4 +0.00022091299741377004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document c4 +0.0002205849150703338 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document c4 +0.0002210648204787979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document c4 +0.0002214235747364102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document c4 +0.00022083907302221787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document c4 +0.0002206334237915964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document c4 +0.00022065193929912214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document c4 +0.00022079775597767288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document c4 +0.00022091492909963518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document c4 +0.00022095009987097293 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document c4 +0.0002208150577180165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document c4 +0.00022085759102772088 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document c4 +0.00022073789170129016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document c4 +0.00022049322781182384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document c4 +0.00022083270617761285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document c4 +0.00021982452827473632 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document c4 +0.00021899870446514259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document c4 +0.00021890358773356361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document c4 +0.00021875556609042841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document c4 +0.00021861195987201226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document c4 +0.00021856782186167455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document c4 +0.00021912837771543515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document c4 +0.00021900213768517756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document c4 +0.00021871675851390374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document c4 +0.0002180537056545586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document c4 +0.0002188196714327129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document c4 +0.00021851362624523464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document c4 +0.0002183236795498736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document c4 +7.291153618675672e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document c4 diff --git a/ALCF/data-lists/sunspot/cc.txt b/ALCF/data-lists/sunspot/cc.txt new file mode 100644 index 0000000000..d771efb06a --- /dev/null +++ b/ALCF/data-lists/sunspot/cc.txt @@ -0,0 +1,1108 @@ +0.0003742481815405742 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document cc +0.00038204855962733055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document cc +0.00038821818392663593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document cc +0.00038723332988783727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document cc +0.00038916141142149904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document cc +0.00038049542523949033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document cc +0.0003854755539534284 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document cc +0.00024202756466512517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document cc +0.0003915405155008087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document cc +0.0003927382151931033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document cc +0.0003839151202260479 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document cc +0.00040006817468967907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document cc +0.00040318965964443476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document cc +0.0003831013019452741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document cc +0.00039166638383204036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document cc +0.00039962784023961004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document cc +0.00039536707853602614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document cc +0.0004204304698247758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document cc +0.00041538899178693555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document cc +0.00039186953333675306 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document cc +0.00038945837196504305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document cc +0.0003919951238929062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document cc +0.00044377065718528966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document cc +0.0004407759068603017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document cc +0.0002487811895843715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document cc +0.00039349432045556636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document cc +0.00041223198559462343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document cc +0.0004036573014830213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document cc +0.0003825982215521807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document cc +0.00040386867133151386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document cc +0.00024460575279105167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document cc +0.000269029789531335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document cc +0.0003573757493252864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document cc +0.0004600876681392076 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document cc +0.0002605354166397086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document cc +0.0003882502452157999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document cc +0.0002466747612126512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document cc +0.0004024726105072402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document cc +0.00040820631128483644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document cc +0.0002691094350403538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document cc +0.00026916830387277267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document cc +0.0004204663297880574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document cc +0.00042379698687085554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document cc +0.0004502169227311871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document cc +0.0002661708937015295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document cc +0.00031239486948031334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document cc +0.0003109054589936201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document cc +0.00045873053079760646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document cc +0.00022904931423244635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document cc +0.0003813462028433663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document cc +0.00039188129256500874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document cc +0.00045124222276983765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document cc +0.00048138658436853695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document cc +0.0003944178776279866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document cc +0.00039941569676754006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document cc +0.00037952761190240494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document cc +0.0003944870860881476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document cc +0.0003891842411856621 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document cc +0.000387688981934861 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document cc +0.00039197953876258005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document cc +0.00039007915280311206 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document cc +0.0003995520363699188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document cc +0.00039230985654592406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document cc +0.0003929472067173851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document cc +0.0003924096172671473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document cc +0.0003881636143629905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document cc +0.000389790617937084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document cc +0.00037351762309221023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document cc +0.0003630196170929407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document cc +0.00033532465765142113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document cc +0.0003076088685761823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document cc +0.00039463850897720803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document cc +0.0002843816115231449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document cc +0.0002909175709416474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document cc +0.00028867170997202486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document cc +0.0002838644617723659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document cc +0.00029027869525543416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document cc +0.0002821339567560056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document cc +0.0002922988877045601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document cc +0.0002866955958315786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document cc +0.0002865271754558126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document cc +0.0002861247475618473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document cc +0.0002826681072408606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document cc +0.0002849746458282827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document cc +0.0002816966633435316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document cc +0.00026255342235948463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document cc +0.0002552895098829678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document cc +0.00025990194083107813 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document cc +0.0002524062657685835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document cc +0.0002538577379748611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document cc +0.0002561415177406761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document cc +0.00026206253059694905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document cc +0.00026168095406910565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document cc +0.0002601305742008613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document cc +0.00025200823006814814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document cc +0.0003229951981263502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document cc +0.00037289448266476045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document cc +0.0003807825862179898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document cc +0.0003616333738191483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document cc +0.0003665117918907636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document cc +0.0003684186453633228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document cc +0.0003589330610806066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document cc +0.00036383861418030395 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document cc +0.000359841363355303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document cc +0.00036431044063050464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document cc +0.0003668574090358279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document cc +0.000362768263620199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document cc +0.0003501888032771077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document cc +0.000352401968221528 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document cc +0.0003541019701869794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document cc +0.0003628121865546891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document cc +0.0003752582953758773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document cc +0.00037902046230424966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document cc +0.0003777927146925147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document cc +0.0003760676130509053 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document cc +0.00034046049078755405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document cc +0.0003338847563259091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document cc +0.00033294499102761794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document cc +0.0004912026198265864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document cc +0.00032064363474664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document cc +0.00032154190389541214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document cc +0.00032309660151746207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document cc +0.00031181143365304544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document cc +0.00031046092294569104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document cc +0.00031150165249068046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document cc +0.0003041314265988224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document cc +0.0003024834909739394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document cc +0.0003019936835833604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document cc +0.000292329665283177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document cc +0.0002867061143144972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document cc +0.00028443615610701707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document cc +0.00028462291013755945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document cc +0.0002793538601205013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document cc +0.00027306573977044246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document cc +0.00027097155673336525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document cc +0.0002752934202112985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document cc +0.00043042012694697647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document cc +0.00047495648822986177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document cc +0.00047755032493473855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document cc +0.0004706974343933747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document cc +0.00046682163297771817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document cc +0.0004616765425874178 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document cc +0.00030644496751628097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document cc +0.0002909492555358308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document cc +0.00027272036068261724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document cc +0.0004101070217315588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document cc +0.0003728914338834357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document cc +0.00036546911442305647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document cc +0.0003669945482407483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document cc +0.0003715902407424017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document cc +0.00035837486406683366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document cc +0.0003573318538685469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document cc +0.0003553784893071916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document cc +0.0004920659809912352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document cc +0.0004533619411303183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document cc +0.00045067066057818706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document cc +0.00044396985139270645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document cc +0.00043198288204468477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document cc +0.00043005174223738454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document cc +0.00041847118430776784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document cc +0.00042952036375796664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document cc +0.00043420594647324267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document cc +0.0003461123241053012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document cc +0.0003408581597849182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document cc +0.00033172705422182547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document cc +0.0003392566490686136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document cc +0.00033578341518385483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document cc +0.0003439196710518844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document cc +0.00034559163447085543 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document cc +0.00033762478642902825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document cc +0.00033215210055107224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document cc +0.00033423579608014966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document cc +0.0004963355016025102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document cc +0.0004996862761456923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document cc +0.0005000551829325451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document cc +0.0005004212610098755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document cc +0.00027768695585500585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document cc +0.00028395983854338433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document cc +0.00027835826303062254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document cc +0.0002740073176010804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document cc +0.0002791830529274016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document cc +0.0002796863816194411 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document cc +0.00026697453022672804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document cc +0.0002594197440280141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document cc +0.0003779565697649222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document cc +0.00041835823476586606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document cc +0.00043788493575265915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document cc +0.0002731731970096006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document cc +0.000276305847423402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document cc +0.0002704955773958623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document cc +0.0002629635944827518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document cc +0.000260070956974436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document cc +0.00025661553791456334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document cc +0.00025794727207576157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document cc +0.00025295733980001527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document cc +0.0003788106407021029 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document cc +0.0004882344027669431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document cc +0.0003275324309642705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document cc +0.0004803401856640094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document cc +0.00046720138323433943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document cc +0.00043527810307095335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document cc +0.00043905395741627827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document cc +0.00048774175867331425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document cc +0.00048380704121346737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document cc +0.0004779011848346118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document cc +0.00046255587581908036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document cc +0.00045127922880511576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document cc +0.0004503891485256095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document cc +0.0004450142332303422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document cc +0.00044630282482516654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document cc +0.00044325014465743616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document cc +0.0004263874842796447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document cc +0.0004217530913646938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document cc +0.000415120314341852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document cc +0.00040987168279144537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document cc +0.00033468337266607834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document cc +0.0003353094464683005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document cc +0.0004833936821707294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document cc +0.00047194878988920935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document cc +0.0004648324126996427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document cc +0.0004562345003964941 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document cc +0.0004933203505465098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document cc +0.0003530166075325466 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document cc +0.00035368548192804685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document cc +0.0004872620828289663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document cc +0.00048293889392426456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document cc +0.00047936768462267655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document cc +0.00047821013991587545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document cc +0.0004660610308564753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document cc +0.000394683430103437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document cc +0.00039165053441571324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document cc +0.0003906936040164381 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document cc +0.00038074803919159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document cc +0.0003686529291578143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document cc +0.00035832920428870976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document cc +0.00035929024535947033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document cc +0.0003538226556050544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document cc +0.0003584167868708799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document cc +0.0003480507542594234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document cc +0.0003413709023543034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document cc +0.00034001304759361455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document cc +0.00033430532902756514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document cc +0.00046519252660631277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document cc +0.0002938876402514769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document cc +0.00028676090994509047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document cc +0.00027296150117506716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document cc +0.00026513502621960483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document cc +0.0002680081327926125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document cc +0.00025831225828720344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document cc +0.00026647037295561 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document cc +0.0002525733734572654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document cc +0.00025831708887575375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document cc +0.00042487627444443476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document cc +0.0004951213245023891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document cc +0.0004804051413177752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document cc +0.0004662397611340532 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document cc +0.0004550138655253933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document cc +0.00044494909122746795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document cc +0.0002899112253051385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document cc +0.0004372879736279761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document cc +0.0004529568099252922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document cc +0.00045127826158829573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document cc +0.0004436558176737439 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document cc +0.0004419233237678378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document cc +0.000434589215880319 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document cc +0.00029153613207706566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document cc +0.0004312458058738854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document cc +0.00028741854968757313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document cc +0.00046853200754421234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document cc +0.0004949145252030074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document cc +0.00044459683920483167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document cc +0.0003836095306696336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document cc +0.0003789760237872398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document cc +0.0003749227438304427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document cc +0.0003628558277173369 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document cc +0.00039468301394041474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document cc +0.00038874701821614864 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document cc +0.0004158492456077867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document cc +0.00042360504554060077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document cc +0.00040386729844317623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document cc +0.00027595096702902474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document cc +0.00043638766787829135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document cc +0.0002218691596850179 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document cc +0.0004437566108089954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document cc +0.0003889996411609667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document cc +0.00043454421906537704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document cc +0.0004522564392830988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document cc +0.00041517835659357416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document cc +0.0002614360863446896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document cc +0.00037543522111463596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document cc +0.0004386190133514781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document cc +0.00046358333286115075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document cc +0.00043186261317942404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document cc +0.0002377581602097957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document cc +0.00025973334085074254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document cc +0.00040139099332000796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document cc +0.00043674860686687174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document cc +0.00040853289309329373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document cc +0.000242910191729688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document cc +0.0004431071731750582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document cc +0.0004388092670482523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document cc +0.000381418866255965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document cc +0.0004100117296419717 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document cc +0.00042469230366022745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document cc +0.00041744151905374254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document cc +0.00022835699906752945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document cc +0.0004380161085387397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document cc +0.00044803212381807456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document cc +0.00040554932796137236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document cc +0.0004234508646347761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document cc +0.00043341209652360653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document cc +0.00023966604734537185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document cc +0.000259165907316014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document cc +0.0004270653021833602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document cc +0.0004341547032162028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document cc +0.0004111478117275994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document cc +0.0004299383567984396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document cc +0.0004241899124590779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document cc +0.0004502719349364145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document cc +0.00038994621469645615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document cc +0.0003859912398894952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document cc +0.0004247535950310557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document cc +0.000386982084327716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document cc +0.0004196451040053251 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document cc +0.0004096278509782259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document cc +0.0004373334932695721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document cc +0.0004180889975240641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document cc +0.00042079636929672745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document cc +0.00038063574611812913 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document cc +0.0003817505891515542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document cc +0.0004420096268860222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document cc +0.00039182670726410623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document cc +0.0003635667850372299 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document cc +0.00041564996472055667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document cc +0.000400529358757286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document cc +0.0003939113874958451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document cc +0.00039066622068940996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document cc +0.0004290098538807143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document cc +0.0004240739958197099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document cc +0.00040775392659215333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document cc +0.0004091634200396925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document cc +0.00042299190476617914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document cc +0.0003701492680344151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document cc +0.0003807353844384635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document cc +0.00038813507771983156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document cc +0.00040072346558408346 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document cc +0.0003603595180423597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document cc +0.00038799421353112465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document cc +0.00037575235582264926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document cc +0.0004239190342959713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document cc +0.0004606044799136546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document cc +0.00045107950652529253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document cc +0.0004391947201871058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document cc +0.0004457516661123035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document cc +0.0004301297170991686 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document cc +0.00044661704164586694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document cc +0.0004438849846114837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document cc +0.0004444205734316823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document cc +0.0004190924165303394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document cc +0.00043942581131677875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document cc +0.00021568459798090663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document cc +0.0003814929225407199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document cc +0.0003217453179359235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document cc +0.00031719591470267974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document cc +0.00032434115726922137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document cc +0.0004079911120371051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document cc +0.000329492766381148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document cc +0.0003845916162001633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document cc +0.0003835208964390098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document cc +0.00037847334157173194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document cc +0.00038296039903791865 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document cc +0.00037896336828472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document cc +0.00037620974396391355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document cc +0.00037420590727111843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document cc +0.000340490625886403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document cc +0.0003078314411035827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document cc +0.00034153990750656097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document cc +0.0003308858103982067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document cc +0.0003452640607156025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document cc +0.00033095276418403455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document cc +0.0003116308995860414 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document cc +0.00032446713226408477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document cc +0.0003015816821912984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document cc +0.00031612418775706894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document cc +0.0003278516344971041 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document cc +0.00033079446736097217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document cc +0.00032278977146550837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document cc +0.00032065272988207914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document cc +0.0003936696452406576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document cc +0.0003450109536627789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document cc +0.0003339787189919641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document cc +0.0003284303856176974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document cc +0.00033652677276843477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document cc +0.0003257822443845694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document cc +0.0003293985569149334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document cc +0.0003310360260148262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document cc +0.0003233770986418526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document cc +0.0003172280092149422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document cc +0.0003160674744292835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document cc +0.00030931090289598506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document cc +0.0003093173886443107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document cc +0.00033167847081104083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document cc +0.00031131501311729723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document cc +0.00031046608876279845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document cc +0.00030569235942207244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document cc +0.00030777943671285197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document cc +0.00029303314290956683 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document cc +0.0003045824546400205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document cc +0.00030360880677729793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document cc +0.00031646239964835433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document cc +0.0003129122300603785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document cc +0.00031060464956661433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document cc +0.000311819032500067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document cc +0.0002977872483902282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document cc +0.0003009448600922438 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document cc +0.00028610292098537774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document cc +0.0002988326876216654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document cc +0.00028550828372819075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document cc +0.0002830381750875739 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document cc +0.0002848495855927156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document cc +0.0002856443760308144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document cc +0.00027442895344188584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document cc +0.0002681160554049462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document cc +0.0003421482544126989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document cc +0.0004005872948449718 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document cc +0.0003930123959320308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document cc +0.0003867271832275778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document cc +0.000380805140455254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document cc +0.0003814769861947819 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document cc +0.00038025170883282324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document cc +0.0003738026647867475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document cc +0.00018960856915036276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document cc +0.0003697177501953134 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document cc +0.00036674194328136693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document cc +0.00036447406838697555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document cc +0.00036686410861101255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document cc +0.00035915267825103423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document cc +0.0003624758404026675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document cc +0.0002822812140180794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document cc +0.00030620512946920813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document cc +0.000294249776520589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document cc +0.00030238536967523434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document cc +0.00029509593361580754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document cc +0.0002906912701830899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document cc +0.0002921944165474959 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document cc +0.00028358919691127954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document cc +0.0002813182772323272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document cc +0.00027442640800299205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document cc +0.0002747820342933984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document cc +0.0002747584403979717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document cc +0.00027499129634862444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document cc +0.0002712050404257197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document cc +0.0002616256943143254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document cc +0.00026769938929002815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document cc +0.00038396081322727017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document cc +0.0003863140490027991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document cc +0.00037702277513203237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document cc +0.0003633274156107032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document cc +0.0003587473889240435 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document cc +0.0003507672084278415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document cc +0.00033776425499780385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document cc +0.0003377914127574796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document cc +0.00032948015659161326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document cc +0.00033245638541392985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document cc +0.00031080707640648695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document cc +0.0002976903331149755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document cc +0.0002965121463725523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document cc +0.0002933849695266647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document cc +0.0002837035078508233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document cc +0.00028684569079589323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document cc +0.0003145192320802359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document cc +0.0003566937253273515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document cc +0.0003470199109592918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document cc +0.0003060245312041868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document cc +0.0002650817213818789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document cc +0.0002643604938780134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document cc +0.000299350876031416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document cc +0.0003178540797697938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document cc +0.000271850367887767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document cc +0.00031349896596549 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document cc +0.00031749734412765755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document cc +0.0003791137842391209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document cc +0.0003742334169957992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document cc +0.0003705639757351107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document cc +0.0003126986769797042 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document cc +0.00031038132814561196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document cc +0.00036464437173804883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document cc +0.0003569480488951322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document cc +0.0003541239221619106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document cc +0.00035315297411308053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document cc +0.0003572451925404141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document cc +0.0003514986129411253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document cc +0.0003521798298425866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document cc +0.00034553677439244716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document cc +0.000349004719809412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document cc +0.0003468247484872769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document cc +0.0003465822608356558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document cc +0.00035410983132162007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document cc +0.0003487908354969444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document cc +0.0003479024763238147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document cc +0.000341412530646823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document cc +0.00034451316273667034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document cc +0.0002618849993484869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document cc +0.00026788679978901144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document cc +0.00027450670773227214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document cc +0.0002661273129899329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document cc +0.00026836569676402957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document cc +0.00026155876975483236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document cc +0.0002609276830117151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document cc +0.0002644161630512771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document cc +0.00036789208972872557 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document cc +0.00037829849439990513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document cc +0.0003788894943523098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document cc +0.0003617207777959397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document cc +0.0002541334487248998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document cc +0.0002707945538071073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document cc +0.00027046282716455214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document cc +0.0002652443167243215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document cc +0.0002685859923850986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document cc +0.00025734961751176414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document cc +0.000259041720872915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document cc +0.00025340107274823446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document cc +0.00025757135121837893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document cc +0.00025617700500574084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document cc +0.0002566931670562857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document cc +0.0002543871190716101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document cc +0.00024997565589481713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document cc +0.0002954079779456287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document cc +0.00034890741135252835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document cc +0.0003473298137731525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document cc +0.0003296959618486435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document cc +0.0003304520061604598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document cc +0.00032377956175729824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document cc +0.00031700696295168713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document cc +0.0003060382346081943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document cc +0.0003012003005056863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document cc +0.0002981074073993884 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document cc +0.0002922128825950705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document cc +0.000348901087722931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document cc +0.0003408286289467841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document cc +0.0003410649680770183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document cc +0.0003358524215576502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document cc +0.0003343661874989231 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document cc +0.00032810573699389156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document cc +0.00032261449539097497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document cc +0.0003162694866049203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document cc +0.0003158381156468853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document cc +0.000317376061083603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document cc +0.0003125788639953052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document cc +0.0003010105041885602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document cc +0.0003065865059090678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document cc +0.0003084275726508053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document cc +0.00030966560718296085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document cc +0.0002957728057853081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document cc +0.00029904164542325336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document cc +0.0002955358888729187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document cc +0.00028692976446931544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document cc +0.0002923476214935797 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document cc +0.0002893691697212419 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document cc +0.0002855895211981585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document cc +0.00027968347097626246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document cc +0.0002810783462604979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document cc +0.00027794080455729715 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document cc +0.00034784376461416953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document cc +0.0003488347959010943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document cc +0.00034790583710250724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document cc +0.000345913166618151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document cc +0.00033801936268066675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document cc +0.0003290591130212315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document cc +0.00034051399521366823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document cc +0.00032470943131841784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document cc +0.00031679540050914276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document cc +0.00031814596342422325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document cc +0.0003156466289485036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document cc +0.00029985010879003633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document cc +0.0002905176377776361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document cc +0.0004206836775460856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document cc +0.00020660449162246918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document cc +0.0003461727254468087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document cc +0.00020592870907067763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document cc +0.00034173505299233005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document cc +0.0004052437256652738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document cc +0.0004080650901351697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document cc +0.00039778184149144276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document cc +0.00039046311464950275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document cc +0.00039043444911071384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document cc +0.000388575704932843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document cc +0.00019737533145666597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document cc +0.00037610755595812403 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document cc +0.00037315400127598317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document cc +0.00037415028580922163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document cc +0.00036694041707212337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document cc +0.00018947219857306515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document cc +0.00037046050826533545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document cc +0.0003587440768559087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document cc +0.00034623936498708903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document cc +0.0003502289592617922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document cc +0.00034692398063649823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document cc +0.000339340809421849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document cc +0.0003360510394816983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document cc +0.0003354673850814145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document cc +0.00032937682875877047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document cc +0.00032844505049317715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document cc +0.00028287199339908627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document cc +0.0002795217197003578 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document cc +0.00028048955601883463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document cc +0.0002769326396439027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document cc +0.0002727090021299243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document cc +0.0002726577841024554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document cc +0.00026663619593455374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document cc +0.00026068042672138127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document cc +0.0002637704114326801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document cc +0.0002593043567100412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document cc +0.0002599897110113453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document cc +0.0002435078682758859 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document cc +0.0002450530071379054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document cc +0.00024233331983743606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document cc +0.0002934750947999535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document cc +0.00033241226364044474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document cc +0.00032938406090272075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document cc +0.00032778705403953246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document cc +0.00032184551480398754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document cc +0.00031874002264945737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document cc +0.0003165319685666433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document cc +0.00031307071173376295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document cc +0.00031119524184911957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document cc +0.0003102253344576429 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document cc +0.0003088976240383192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document cc +0.0002951410823077708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document cc +0.00029772657676757413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document cc +0.0003056048989909935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document cc +0.00031991305381648026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document cc +0.00030890256978362426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document cc +0.0003109382904091933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document cc +0.00031035798529690644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document cc +0.00030741666395911753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document cc +0.0002989918594861846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document cc +0.00029569635443989434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document cc +0.0002973992445667285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document cc +0.000293397351001072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document cc +0.00028737817438047954 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document cc +0.00028252738144009747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document cc +0.0002805511898623541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document cc +0.0003718020784620472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document cc +0.0003499713845765235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document cc +0.00034283547445326676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document cc +0.00031464759888838765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document cc +0.00033188946446414833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document cc +0.000326084432195463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document cc +0.0003764568303917893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document cc +0.0003604955598858414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document cc +0.0003655654554133222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document cc +0.00035762304033750504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document cc +0.00038478883950347103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document cc +0.00027735714341247454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document cc +0.00028139534607773563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document cc +0.00019777292251713763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document cc +0.000285571704874486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document cc +0.00028543482146244363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document cc +0.00019434234484256758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document cc +0.00027854908176986763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document cc +0.0002847068039566143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document cc +0.00028672356943064853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document cc +0.00027782687605808177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document cc +0.0002843539634105203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document cc +0.0002894748379090401 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document cc +0.0002868852440186493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document cc 
+0.0002818504885373851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document cc +0.00028680112812941034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document cc +0.00019258978168723977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document cc +0.00028760637934715155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document cc +0.0002820439443912918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document cc +0.0002831001054410018 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document cc +0.00029001901552467397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document cc +0.00027779449377883156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document cc +0.00019949837437516796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document cc +0.0002907306472984446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document cc +0.00027814858381318327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document cc +0.00019472790889161432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document cc +0.00020472626596924125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document cc +0.0002870045081974301 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document cc +0.00019812241927078482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document cc +0.0002817553333369554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document cc +0.00027829782796642117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document cc +0.00028289431732284113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document cc +0.0002795526296717729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document cc +0.00027682829988044574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document cc +0.0002895432402719184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document cc +0.0002823174903941811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document cc +0.00028170972351837796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document cc +0.00027807915877838826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document cc +0.00028588515681452956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document cc +0.00028112324090816726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document cc +0.00020636178289985485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document cc 
+0.00019447255290980535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document cc +0.0002850824220591452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document cc +0.00027856429520116784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document cc +0.0002820880676635633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document cc +0.00028943902215995714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document cc +0.0002676366291085329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document cc +0.00023806333809954687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document cc +0.00024526460430233455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document cc +0.00023876876664622726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document cc +0.00023379770334179805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document cc +0.00024175151269138382 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document cc +0.00023386583242595706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document cc +0.00023771797150160827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document cc +0.0002262748967483896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document cc +0.0002408148346432682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document cc +0.00023398651720444235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document cc +0.00022989433874474592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document cc +0.00023948500543957772 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document cc +0.0002331594076859196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document cc +0.00023375132439600242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document cc +0.00023923410909668642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document cc +0.00023952796315562954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document cc +0.0002327466076905069 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document cc +0.00023082758956797212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document cc +0.0002240509275524448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document cc +0.00022798879995765268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document cc +0.000221172516774386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document cc 
+0.00021767045123534623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document cc +0.00021982832794804484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document cc +0.00021971626543789102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document cc +0.00022566565206920132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document cc +0.0002181984894194856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document cc +0.00021831417549554653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document cc +0.00021601405421187145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document cc +0.00022275733725519607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document cc +0.00021847734911973986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document cc +0.0002243591012664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document cc +0.00021688758139483833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document cc +0.0002182953624789215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document cc +0.00020475155724026002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document cc +0.00021498078062960065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document cc +0.0002157914337233064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document cc +0.00021781838494967963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document cc +0.00021723242266814558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document cc +0.0002176782686553837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document cc +0.0003486179404943968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document cc +0.00034882846352857634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document cc +0.00031400868448352596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document cc +0.00030273484020011963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document cc +0.00029895889118145404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document cc +0.00029770764609621714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document cc +0.0002990181332116852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document cc +0.00029653733972285996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document cc +0.00029624649222942476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document 
cc +0.00029625609720203576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document cc +0.00029731928930852147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document cc +0.00029011721326148513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document cc +0.00028849788197494655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document cc +0.00021601278623858145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document cc +0.00021319599281739178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document cc +0.0002153325290600083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document cc +0.00018566946174516558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document cc +0.00020736824394291617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document cc +0.00020857419820128004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document cc +0.00020058526129536423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document cc +0.00020745812166665217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document cc +0.00020652171015271702 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document cc +0.00020643808911278608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document cc +0.00020040513914482103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document cc +0.00020598050188272898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document cc +0.0001969184139343296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document cc +0.0001972748812937012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document cc +0.0002038556751586195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document cc +0.00020245186011313464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document cc +0.00019950381422038783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document cc +0.00020837055459665258 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document cc +0.00020371856218246096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document cc +0.00019537612301625791 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document cc +0.00019914984508813857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document cc +0.0002053787713691309 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document cc +0.00019082100541008637 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document cc +0.00020397153334531813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document cc +0.0002021462693077317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document cc +0.00019609357008124035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document cc +0.00019693256622486236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document cc +0.00020007239732428112 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document cc +0.00020467075741591954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document cc +0.00019584883400022932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document cc +0.00019135050391176972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document cc +0.0003362829834208298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document cc +0.00034013691154784095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document cc +0.00033215887031941976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document cc +0.00032681189065396707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document cc +0.0003149138485493094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document cc +0.00030179177307540077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document cc +0.0002923278437581119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document cc +0.00029470052278994486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document cc +0.0002994095093045731 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document cc +0.00029033525096085037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document cc +0.00029390798852496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document cc +0.0002916230924130842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document cc +0.00029419886374594913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document cc +0.0002865469756730764 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document cc +0.00021191292549942086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document cc +0.00021369664817409847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document cc +0.00021612485624266726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document cc +0.00022242192634588478 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document cc 
+0.00014605095659989698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document cc +0.00022070626106341693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document cc +0.0002174420774054071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document cc +0.00021325858963116995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document cc +0.0002124322999488052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document cc +0.0002081218896969054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document cc +0.0002108710211556957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document cc +0.00020686867095978426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document cc +0.00020895752681041895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document cc +0.00020741922266415738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document cc +0.0002069112657197308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document cc +0.00020644627473468118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document cc +0.00020332991338121604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document cc +0.0003560895677789848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document cc +0.00032915779111908214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document cc +0.00033810613317040864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document cc +0.00033729626594036923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document cc +0.00033550342864602944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document cc +0.00034173474024556906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document cc +0.000331505340748827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document cc +0.0003270050330117195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document cc +0.00032585275329172556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document cc +0.0003143383203190604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document cc +0.00031655199110388894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document cc +0.00030738872158476413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document cc +0.00030838388352699285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document cc +0.0003053596995351888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document cc 
+0.00031836304739584593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document cc +0.000315315435873905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document cc +0.0003087116248965243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document cc +0.00030396790625537645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document cc +0.0003335812246032149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document cc +0.00034570956323095843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document cc +0.00034563035636675786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document cc +0.00033411265479076335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document cc +0.00034439191141692787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document cc +0.0003364483125496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document cc +0.0003299500453608033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document cc +0.00033163377700074837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document cc +0.00032638649660627673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document cc +0.00032616167939645234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document cc +0.0003205289298760723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document cc +0.00031939393740815355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document cc +0.00031593164066731296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document cc +0.00031928871111254405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document cc +0.00029670189073175004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document cc +0.00020517703846735904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document cc +0.00020128418186172073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document cc +0.00019662723895606717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document cc +0.0001981157042081407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document cc +0.00019703489037041608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document cc +0.00019079796331785068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document cc +0.0001909352306690079 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document cc +0.00018824662295261396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document cc 
+0.00019864275319325954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document cc +0.00018818516521649587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document cc +0.00018875694972812844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document cc +0.00018231621170645482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document cc +0.00018349407845798273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document cc +0.00018088971427746906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document cc +0.00018296284236327237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document cc +0.0001876011825819916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document cc +0.000329052068725176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document cc +0.00032223616273648536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document cc +0.00031272564089633955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document cc +0.00031621609908414494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document cc +0.0003117213560911235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document cc +0.00030218064069945934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document cc +0.00030658916600512085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document cc +0.0002915863534115821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document cc +0.0002940280138374372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document cc +0.00029067860468866085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document cc +0.00028529228063135635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document cc +0.00028336893301452256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document cc +0.0002794668089130099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document cc +0.00021681361378827842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document cc +0.0001484664674497246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document cc +0.00021950558378215133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document cc +0.00021806860758808645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document cc +0.00021819568718852282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document cc +0.00021626925931585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document 
cc +0.0001464536143077762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document cc +0.00021432777088808917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document cc +0.000213473805865147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document cc +0.00021397067253964538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document cc +0.00020758957647437263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document cc +0.00020687124337683314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document cc +0.00020630057046511005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document cc +0.0002091166859352538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document cc +0.00020777355025615267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document cc +0.00020709287641496176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document cc +0.00020736464660577094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document cc +0.00020062246741862607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document cc +0.00020693207561942915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document cc +0.00021151004871893024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document cc +0.00019930249098689716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document cc +0.00021589710041231824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document cc +0.00021369204789905741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document cc +0.0002147099923936778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document cc +0.00021077531190389536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document cc +0.0002100509829113836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document cc +0.00021185362601571124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document cc +0.00020722136637339565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document cc +0.00020300093701169531 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document cc +0.00019859737993313477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document cc +0.00019971314372100164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document cc +0.00019549908270269278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document cc +0.00019649820843534028 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document cc +0.00019619415513498067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document cc +0.00019493006120377898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document cc +0.00019499409035775506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document cc +0.00019252988593634277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document cc +0.00019440768268686405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document cc +0.00018747161324755577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document cc +0.0001879575932372779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document cc +0.00019040707058357506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document cc +0.0001871931095090703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document cc +0.00020112966223017096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document cc +0.00020516878165311017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document cc +0.00020664735191740533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document cc +0.00021041398572882962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document cc +0.00020397992929690396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document cc +0.0002039978580295561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document cc +0.00020592785601142126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document cc +0.0001990755527445265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document cc +0.00019729564847798732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document cc +0.00019958182230527032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document cc +0.0001985037302636386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document cc +0.00020204130355115716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document cc +0.0002000296401958085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document cc +0.0001983064832295463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document cc +0.00019663108484195617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document cc +0.00019510678560556523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document cc +0.0001873284057063206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document cc 
+0.00019311553072495885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document cc +0.00034652137288816547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document cc +0.0002813690318850024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document cc +0.00027697649713138685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document cc +0.0002755419092534421 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document cc +0.0002681583054440219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document cc +0.00026945753192750824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document cc +0.00026169470768245737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document cc +0.00026437008960810825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document cc +0.0002637294838228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document cc +0.00026491867965088836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document cc +0.00025504483625138986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document cc +0.0002545040623796586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document cc +0.0002546682814073622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document cc +0.00025545439487142615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document cc +0.0002626896557978271 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document cc +0.00025092040940402784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document cc +0.0002589154885863872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document cc +0.00024106160482721467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document cc +0.0002483289690087987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document cc +0.0002388930282784437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document cc +0.00024006340759273874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document cc +0.00023765248178029045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document cc +0.00023061351965578936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document cc +0.00024954224883546477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document cc +0.00017861017233018525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document cc +0.00017810832743667658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document cc 
+0.00017599709170759497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document cc +0.00017462723516505223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document cc +0.0002906316527068669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document cc +0.00033762141066247166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document cc +0.00017170670574152494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document cc +0.00017258674515137717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document cc +0.0002815386173173926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document cc +0.0002996845935618989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document cc +0.0002735268488987296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document cc +0.0002971738713071517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document cc +0.0002942690674002763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document cc +0.0003322222207729567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document cc +0.0003378721656198464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document cc +0.00018307262621851067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document cc +0.00033956081502775057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document cc +0.00031604820927876276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document cc +0.00028805657681088917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document cc +0.00026312293321215633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document cc +0.00034366936722921455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document cc +0.0002865256504406559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document cc +0.0003063615195861786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document cc +0.00028412791619666136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document cc +0.00028060835132727154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document cc +0.00032544974761560506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document cc +0.0002647177833217225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document cc +0.0003152621884896575 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document cc +0.0003054625140336913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document cc 
+0.00031183308312292263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document cc +0.00018175026696621178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document cc +0.00017699918328872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document cc +0.00018222339261441908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document cc +0.00018348005930964137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document cc +0.0001810735993810541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document cc +0.00030846441282038914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document cc +0.0002972326889310354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document cc +0.00017433421318235594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document cc +0.00032799458649525895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document cc +0.00032482130048512673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document cc +0.00031943465668672475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document cc +0.00029615593630484517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document cc +0.0002893126939511001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document cc +0.0002849288351723284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document cc +0.00028383906633569267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document cc +0.00028072526091262615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document cc +0.000284239564292377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document cc +0.0002778903109432523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document cc +0.0002771644389501471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document cc +0.0002733316182319337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document cc +0.00026362539185869363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document cc +0.0002636325383220217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document cc +0.00026740622442302886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document cc +0.0002646771971853427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document cc +0.0002628566720605389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document cc +0.0002644760695434766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document cc 
+0.0002623837702310999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document cc +0.00026088722976772894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document cc +0.0002567065374799158 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document cc +0.00018857382101207726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document cc +0.00019036580399817203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document cc +0.00018348828065261222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document cc +0.00018491851780345073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document cc +0.00018904887260080187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document cc +0.0001875609304251801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document cc +0.00018393034720015817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document cc +0.00018419795526114903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document cc +0.00018699955623404795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document cc +0.00018276256902965128 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document cc +0.00017698045695190812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document cc +0.00018104650132303642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document cc +0.00017758206731279688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document cc +0.00017131402995103497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document cc +0.000175944428350446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document cc +0.0003416745727147391 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document cc +0.0003163259373952889 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document cc +0.0002804489269172448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document cc +0.00028748272397403175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document cc +0.00027603318345630605 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document cc +0.000271638824679648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document cc +0.0002763761210210942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document cc +0.00026501984873172717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document cc +0.00026422486894694714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document cc 
+0.0002686339100849262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document cc +0.0002610837453940606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document cc +0.000260974343729353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document cc +0.0002599403837029134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document cc +0.0002937273113238609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document cc +0.0003341790732600504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document cc +0.0002620661576600244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document cc +0.0003027929169239288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document cc +0.00031944039129326894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document cc +0.00019025676304139009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document cc +0.00018680910145009907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document cc +0.00034215840419416437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document cc +0.00018618120812119364 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document cc +0.00018605853095599425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document cc +0.00018120712626096538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document cc +0.00018315079292495327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document cc +0.00018362556449041974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document cc +0.0001780024456718171 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document cc +0.00033296526436178697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document cc +0.0001802398632282846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document cc +0.00017340263100798256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document cc +0.00017755840547238697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document cc +0.00018419413735260606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document cc +0.00017869518174591322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document cc +0.00017526271460129484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document cc +0.00017852168597981907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document cc +0.00017566536156787157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document cc 
+0.00017589867964432936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document cc +0.00017831487394075305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document cc +0.00017837310528935862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document cc +0.00018200908814216548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document cc +0.0001795136627511612 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document cc +0.0003414021775300033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document cc +0.00017177291787788502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document cc +0.0003441900648571877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document cc +0.0003394534597060673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document cc +0.0003236887233114832 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document cc +0.0001639544129688747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document cc +0.00019137443753211255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document cc +0.00018575146284680153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document cc +0.00019184792863440243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document cc +0.00018966043065679055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document cc +0.00017968851317035848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document cc +0.00018479881897661546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document cc +0.0001813642692683015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document cc +0.0001686449798983066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document cc +0.00018516104592230446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document cc +0.00031283726601066385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document cc +0.0003248607542883853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document cc +0.00031583241601202365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document cc +0.00031238270857730376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document cc +0.000307150592403979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document cc +0.00029443829986847044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document cc +0.0002942723732234677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document cc 
+0.00023514930666443422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document cc +0.0020776328951453444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document cc +0.0021768234410538883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document cc +0.002106973549276289 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document cc +0.002110915756171751 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document cc +0.0017032382109816464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document cc +0.0019047944877712286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document cc +0.0019402711744016077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document cc +0.0006264790011223686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document cc +0.0017885401938106643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document cc
+
diff --git a/ALCF/data-lists/sunspot/dolma.txt b/ALCF/data-lists/sunspot/dolma.txt
new file mode 100644
index 0000000000..7015d885c1
--- /dev/null
+++ b/ALCF/data-lists/sunspot/dolma.txt
@@ -0,0 +1,2419 @@
+0.0018520780893211373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document algebraic-stack-train +0.0017591050606817512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document algebraic-stack-train +0.001459052794333798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document algebraic-stack-train +0.0007405667281569194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document algebraic-stack-train +0.00019420030110896795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document algebraic-stack-train +0.0009008668715801845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document algebraic-stack-train +0.00015115827957143057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document algebraic-stack-train +0.0014552844319220648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document algebraic-stack-train +0.0012469861325685161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document algebraic-stack-train +0.00136412011372413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document algebraic-stack-train +0.0007064279699221103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document algebraic-stack-train +0.0008472240000687427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document algebraic-stack-train +0.0001984375713341955
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document algebraic-stack-train +0.0005472773881697123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document algebraic-stack-train +0.001815779629850992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document algebraic-stack-train +0.0018313600689757324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document algebraic-stack-train +0.0002583902668716813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document arxiv +0.0002646575141232155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document arxiv +0.0003165521247456758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document arxiv +0.0002920706460176214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document arxiv +0.00028396813182810215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document arxiv +0.00030445161883108107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document arxiv +0.00031628781276576474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document arxiv +0.0003083776568189157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document arxiv +0.0003176359471472902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document arxiv +0.0002536009369131698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document arxiv +0.0003067491424681363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document arxiv +0.0002597217257557784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document arxiv +0.0003788556450109768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document arxiv +0.0002796563272052598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document arxiv +0.00033573826524290287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document arxiv +0.00030523658022800287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document arxiv +0.00032211552192240096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document arxiv +0.0003329295675164247 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document arxiv +0.0003101982186639862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document arxiv +0.00032361798234223355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document arxiv +0.0003495541581652915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document arxiv +0.0002821637448858042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document arxiv +0.00030399523537629673 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document arxiv +0.0002955658968247219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document arxiv +0.00028942158502924254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document arxiv +0.00028769546171490733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document arxiv +0.0002938111057234182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document arxiv +0.0002711150403010948 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document arxiv +0.00031130095874747565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document arxiv +0.0003002996118160777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document arxiv +0.0003732757901604459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document arxiv +0.00026784205751795894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document arxiv +0.0002799626521661984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document arxiv +0.00034334276069078164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document arxiv +0.0003582469803674965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document arxiv +0.00031094844818418623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document arxiv +0.0002766228384977191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document arxiv +0.00030297116159471485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document arxiv +0.00027033888377464685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document arxiv +0.00030090862368377933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document arxiv +0.00028543875802490955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document arxiv +0.00027559768459074204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document arxiv +0.0003182185533962886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document arxiv +0.0003311392971435837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document arxiv +0.00028751652060804325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document arxiv +0.000303466863212589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document arxiv +0.00033400462801277524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document arxiv +0.0002589234031777426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document arxiv +0.0002913508598466723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document arxiv +0.0002670572450004856 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document arxiv +0.00032027399105647656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document arxiv +0.00032188376258379377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document arxiv +0.0003161585784100882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document arxiv +0.0003184249182974135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document arxiv +0.00030381336664000807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document arxiv +0.0003190437442184283 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document arxiv +0.0002537961798200545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document arxiv +0.0003017817117223326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document arxiv +0.00028685268513240224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document arxiv +0.00031265179094451165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document arxiv +0.00034708319096986816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document arxiv +0.00026650837943080664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document arxiv +0.00034588832248507335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document arxiv +0.0002416982248399037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document arxiv +0.0003089296918222243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document arxiv +0.00029137184185700827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document arxiv +0.00026464226846800774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document arxiv +0.00030545397919456627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document arxiv +0.0003206778460448875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document arxiv +0.00030968971641110967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document arxiv +0.00023325653928600864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document arxiv +0.00030526899198338555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document arxiv +0.00035376719076633584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document arxiv +0.000290224385981026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document arxiv +0.000294650083382008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document arxiv +0.00028768858128616436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document arxiv +0.00030856965235527843 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document arxiv +0.00030579942447879054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document arxiv +0.0002863101084704357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document arxiv +0.0002870032092492213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document arxiv +0.000264182727569885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document arxiv +0.0002974012367036449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document arxiv +0.00032238412143059203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document arxiv +0.00031683716893819036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document arxiv +0.00031157434937617524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document arxiv +0.0003411742735695989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document arxiv +0.00026778444816570715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document arxiv +0.0003037045797275201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document arxiv +0.00027746114370081314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document arxiv +0.00027148285946862043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document arxiv +0.00028042950114678207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document arxiv +0.0003235607816590721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document arxiv +0.0003086692227306295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document arxiv +0.00033990349455148105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document arxiv +0.00030945053208470265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document arxiv +0.00027309074552265303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document arxiv +0.00028737393506316194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document arxiv +0.0003098868328009879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document arxiv +0.0002614229162588409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document arxiv +0.0002884388407820923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document arxiv +0.0031025147279277244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document books +0.003102019887362634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document books +0.0009996745994661548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document books +0.0002406272620255565 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document c4 +0.0002404825539493424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document c4 +0.00024062296575435581 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document c4 +0.00024069315766818953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document c4 +0.00024055829162263452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document c4 +0.00024062053397343032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document c4 +0.0002410715545206964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document c4 +0.00024024881846087368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document c4 +0.0002407074700790688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document c4 +0.00024072141428809043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document c4 +0.00024027710230872736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document c4 +0.0002409111299205489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document c4 +0.00024081954058275009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document c4 +0.00024086076794990912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document c4 +0.00024098672620832446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document c4 +0.00024068622303333862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document c4 +0.00024140627024291824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document c4 +0.0002414512033594384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document c4 +0.00024028742594941463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document c4 +0.00024018036089269645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document c4 +0.0002398347365034979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document c4 +0.00024006780153485276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document c4 +0.00024015620270419213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document c4 +0.0002408848259695227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document c4 +0.0002408023185278831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document c4 +0.00024021196580140326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document c4 +0.00024077677271297493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document c4 +0.00024087392454668027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document c4 +0.0002408071293824126 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document c4 +0.00024042223828845715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document c4 +0.0002411484752360495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document c4 +0.00023605263746465907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document c4 +0.00023471222158326908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document c4 +0.00023432138580287644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document c4 +0.00023407385623382327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document c4 +0.00023487504174367091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document c4 +0.0002341843704976313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document c4 +0.00023421993170282486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document c4 +0.00023445057969132037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document c4 +0.0002337681680073047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document c4 +0.000234627964808109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document c4 +0.0002338942211888584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document c4 +0.00023403849286843386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document c4 +0.00023405641310796305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document c4 +0.00023349169562397965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document c4 +0.00023381157386048856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document c4 +0.00023388742993790587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document c4 +0.00023363103829469813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document c4 +0.00023421141834630477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document c4 +0.00023420564352232565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document c4 +0.00023367463699173143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document c4 +0.00023344969163567033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document c4 +0.00023372196941547188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document c4 +0.00023399207645297834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document c4 +0.00023357915605505856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document c4 +0.00023337585642190864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document c4 +0.00023385005470157914 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document c4 +0.00023301533534493465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document c4 +0.00023377864302541782 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document c4 +0.00023323745848621437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document c4 +0.0002330594611151835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document c4 +0.0002334149675026783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document c4 +0.00023198945902291534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document c4 +0.00023023784834634142 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document c4 +0.00022985623060187217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document c4 +0.0002292605284569516 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document c4 +0.00022926593333048894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document c4 +0.00022922766406807777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document c4 +0.00022898153911167426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document c4 +0.0002292473111593315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document c4 +0.000228804579400424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document c4 +0.00022865485613513526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document c4 +0.00022937426835887895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document c4 +0.00022917388311587372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document c4 +0.0002291660582019043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document c4 +0.00022907895248360543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document c4 +0.0002294617879920205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document c4 +0.0002290452150516566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document c4 +0.00022943405619715553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document c4 +0.0002296271421006204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document c4 +0.00022854791372910372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document c4 +0.00022923123467686557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document c4 +0.00022852404355738494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document c4 +0.00022847798660086642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document c4 +0.0002289604586810316 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document c4 +0.00022835479834950643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document c4 +0.0002289149402884243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document c4 +0.00022806655474763446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document c4 +0.00022826296420992974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document c4 +0.00022906829636213627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document c4 +0.0002287628414466998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document c4 +0.0002282673911253445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document c4 +0.00022869309841939134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document c4 +0.0002281540116815451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document c4 +0.0002259755756162738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document c4 +0.00022562331285233504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document c4 +0.0002259061146106053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document c4 +0.00022567670836663787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document c4 +0.00022573165387587061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document c4 +0.00022508514961670572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document c4 +0.00022564642513773356 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document c4 +0.00022563088621998788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document c4 +0.0002250438755373707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document c4 +0.00022524465346241134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document c4 +0.00022531737657666812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document c4 +0.00022444687519363458 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document c4 +0.00022460397498596298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document c4 +0.00022454218976501763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document c4 +0.00022447528843671366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document c4 +0.00022501666332178926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document c4 +0.00022453752304377972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document c4 +0.00022484451871163002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document c4 +0.00022465678847154914 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document c4 +0.00022453180917044732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document c4 +0.0002247278486823009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document c4 +0.00022465794828242097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document c4 +0.00022431000701925386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document c4 +0.00022476020248460963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document c4 +0.00022467531771795015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document c4 +0.0002236391309945234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document c4 +0.00022458764920536007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document c4 +0.00022430877426744415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document c4 +0.0002247047786127192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document c4 +0.0002245298090400035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document c4 +0.0002245648831396188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document c4 +0.00022292894729820784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document c4 +0.00022236668082957533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document c4 +0.0002217622659895442 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document c4 +0.00022252452726732609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document c4 +0.00022135333211363678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document c4 +0.0002214571757787971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document c4 +0.0002217188139237798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document c4 +0.00022144214894640303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document c4 +0.00022100172806631854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document c4 +0.00022156392409199052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document c4 +0.00022134830143710272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document c4 +0.00022158598922529453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document c4 +0.00022142932483041377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document c4 +0.00022120980907786554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document c4 +0.00022117917738112441 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document c4 +0.00022077089397851235 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document c4 +0.00022093265074996711 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document c4 +0.00022091299741377004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document c4 +0.0002205849150703338 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document c4 +0.0002210648204787979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document c4 +0.0002214235747364102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document c4 +0.00022083907302221787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document c4 +0.0002206334237915964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document c4 +0.00022065193929912214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document c4 +0.00022079775597767288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document c4 +0.00022091492909963518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document c4 +0.00022095009987097293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document c4 +0.0002208150577180165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document c4 +0.00022085759102772088 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document c4 +0.00022073789170129016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document c4 +0.00022049322781182384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document c4 +0.00022083270617761285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document c4 +0.00021982452827473632 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document c4 +0.00021899870446514259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document c4 +0.00021890358773356361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document c4 +0.00021875556609042841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document c4 +0.00021861195987201226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document c4 +0.00021856782186167455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document c4 +0.00021912837771543515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document c4 +0.00021900213768517756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document c4 +0.00021871675851390374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document c4 +0.0002180537056545586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document c4 +0.0002188196714327129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document c4 +0.00021851362624523464 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document c4 +0.0002183236795498736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document c4 +7.291153618675672e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document c4 +0.0003742481815405742 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document cc +0.00038204855962733055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document cc +0.00038821818392663593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document cc +0.00038723332988783727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document cc +0.00038916141142149904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document cc +0.00038049542523949033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document cc +0.0003854755539534284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document cc +0.00024202756466512517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document cc +0.0003915405155008087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document cc +0.0003927382151931033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document cc +0.0003839151202260479 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document cc +0.00040006817468967907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document cc +0.00040318965964443476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document cc +0.0003831013019452741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document cc +0.00039166638383204036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document cc +0.00039962784023961004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document cc +0.00039536707853602614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document cc +0.0004204304698247758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document cc +0.00041538899178693555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document cc +0.00039186953333675306 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document cc +0.00038945837196504305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document cc +0.0003919951238929062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document cc +0.00044377065718528966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document cc +0.0004407759068603017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document cc +0.0002487811895843715 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document cc +0.00039349432045556636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document cc +0.00041223198559462343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document cc +0.0004036573014830213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document cc +0.0003825982215521807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document cc +0.00040386867133151386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document cc +0.00024460575279105167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document cc +0.000269029789531335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document cc +0.0003573757493252864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document cc +0.0004600876681392076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document cc +0.0002605354166397086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document cc +0.0003882502452157999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document cc +0.0002466747612126512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document cc +0.0004024726105072402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document cc +0.00040820631128483644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document cc +0.0002691094350403538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document cc +0.00026916830387277267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document cc +0.0004204663297880574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document cc +0.00042379698687085554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document cc +0.0004502169227311871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document cc +0.0002661708937015295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document cc +0.00031239486948031334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document cc +0.0003109054589936201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document cc +0.00045873053079760646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document cc +0.00022904931423244635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document cc +0.0003813462028433663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document cc +0.00039188129256500874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document cc +0.00045124222276983765 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document cc +0.00048138658436853695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document cc +0.0003944178776279866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document cc +0.00039941569676754006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document cc +0.00037952761190240494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document cc +0.0003944870860881476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document cc +0.0003891842411856621 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document cc +0.000387688981934861 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document cc +0.00039197953876258005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document cc +0.00039007915280311206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document cc +0.0003995520363699188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document cc +0.00039230985654592406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document cc +0.0003929472067173851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document cc +0.0003924096172671473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document cc +0.0003881636143629905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document cc +0.000389790617937084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document cc +0.00037351762309221023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document cc +0.0003630196170929407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document cc +0.00033532465765142113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document cc +0.0003076088685761823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document cc +0.00039463850897720803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document cc +0.0002843816115231449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document cc +0.0002909175709416474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document cc +0.00028867170997202486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document cc +0.0002838644617723659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document cc +0.00029027869525543416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document cc +0.0002821339567560056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document cc +0.0002922988877045601 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document cc +0.0002866955958315786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document cc +0.0002865271754558126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document cc +0.0002861247475618473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document cc +0.0002826681072408606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document cc +0.0002849746458282827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document cc +0.0002816966633435316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document cc +0.00026255342235948463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document cc +0.0002552895098829678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document cc +0.00025990194083107813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document cc +0.0002524062657685835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document cc +0.0002538577379748611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document cc +0.0002561415177406761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document cc +0.00026206253059694905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document cc +0.00026168095406910565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document cc +0.0002601305742008613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document cc +0.00025200823006814814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document cc +0.0003229951981263502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document cc +0.00037289448266476045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document cc +0.0003807825862179898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document cc +0.0003616333738191483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document cc +0.0003665117918907636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document cc +0.0003684186453633228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document cc +0.0003589330610806066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document cc +0.00036383861418030395 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document cc +0.000359841363355303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document cc +0.00036431044063050464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document cc +0.0003668574090358279 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document cc +0.000362768263620199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document cc +0.0003501888032771077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document cc +0.000352401968221528 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document cc +0.0003541019701869794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document cc +0.0003628121865546891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document cc +0.0003752582953758773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document cc +0.00037902046230424966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document cc +0.0003777927146925147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document cc +0.0003760676130509053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document cc +0.00034046049078755405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document cc +0.0003338847563259091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document cc +0.00033294499102761794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document cc +0.0004912026198265864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document cc +0.00032064363474664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document cc +0.00032154190389541214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document cc +0.00032309660151746207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document cc +0.00031181143365304544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document cc +0.00031046092294569104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document cc +0.00031150165249068046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document cc +0.0003041314265988224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document cc +0.0003024834909739394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document cc +0.0003019936835833604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document cc +0.000292329665283177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document cc +0.0002867061143144972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document cc +0.00028443615610701707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document cc +0.00028462291013755945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document cc +0.0002793538601205013 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document cc +0.00027306573977044246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document cc +0.00027097155673336525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document cc +0.0002752934202112985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document cc +0.00043042012694697647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document cc +0.00047495648822986177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document cc +0.00047755032493473855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document cc +0.0004706974343933747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document cc +0.00046682163297771817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document cc +0.0004616765425874178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document cc +0.00030644496751628097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document cc +0.0002909492555358308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document cc +0.00027272036068261724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document cc +0.0004101070217315588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document cc +0.0003728914338834357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document cc +0.00036546911442305647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document cc +0.0003669945482407483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document cc +0.0003715902407424017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document cc +0.00035837486406683366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document cc +0.0003573318538685469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document cc +0.0003553784893071916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document cc +0.0004920659809912352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document cc +0.0004533619411303183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document cc +0.00045067066057818706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document cc +0.00044396985139270645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document cc +0.00043198288204468477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document cc +0.00043005174223738454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document cc +0.00041847118430776784 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document cc +0.00042952036375796664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document cc +0.00043420594647324267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document cc +0.0003461123241053012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document cc +0.0003408581597849182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document cc +0.00033172705422182547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document cc +0.0003392566490686136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document cc +0.00033578341518385483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document cc +0.0003439196710518844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document cc +0.00034559163447085543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document cc +0.00033762478642902825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document cc +0.00033215210055107224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document cc +0.00033423579608014966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document cc +0.0004963355016025102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document cc +0.0004996862761456923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document cc +0.0005000551829325451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document cc +0.0005004212610098755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document cc +0.00027768695585500585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document cc +0.00028395983854338433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document cc +0.00027835826303062254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document cc +0.0002740073176010804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document cc +0.0002791830529274016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document cc +0.0002796863816194411 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document cc +0.00026697453022672804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document cc +0.0002594197440280141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document cc +0.0003779565697649222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document cc +0.00041835823476586606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document cc +0.00043788493575265915 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document cc +0.0002731731970096006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document cc +0.000276305847423402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document cc +0.0002704955773958623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document cc +0.0002629635944827518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document cc +0.000260070956974436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document cc +0.00025661553791456334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document cc +0.00025794727207576157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document cc +0.00025295733980001527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document cc +0.0003788106407021029 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document cc +0.0004882344027669431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document cc +0.0003275324309642705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document cc +0.0004803401856640094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document cc +0.00046720138323433943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document cc +0.00043527810307095335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document cc +0.00043905395741627827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document cc +0.00048774175867331425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document cc +0.00048380704121346737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document cc +0.0004779011848346118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document cc +0.00046255587581908036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document cc +0.00045127922880511576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document cc +0.0004503891485256095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document cc +0.0004450142332303422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document cc +0.00044630282482516654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document cc +0.00044325014465743616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document cc +0.0004263874842796447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document cc +0.0004217530913646938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document cc +0.000415120314341852 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document cc +0.00040987168279144537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document cc +0.00033468337266607834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document cc +0.0003353094464683005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document cc +0.0004833936821707294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document cc +0.00047194878988920935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document cc +0.0004648324126996427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document cc +0.0004562345003964941 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document cc +0.0004933203505465098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document cc +0.0003530166075325466 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document cc +0.00035368548192804685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document cc +0.0004872620828289663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document cc +0.00048293889392426456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document cc +0.00047936768462267655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document cc +0.00047821013991587545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document cc +0.0004660610308564753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document cc +0.000394683430103437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document cc +0.00039165053441571324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document cc +0.0003906936040164381 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document cc +0.00038074803919159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document cc +0.0003686529291578143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document cc +0.00035832920428870976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document cc +0.00035929024535947033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document cc +0.0003538226556050544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document cc +0.0003584167868708799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document cc +0.0003480507542594234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document cc +0.0003413709023543034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document cc +0.00034001304759361455 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document cc +0.00033430532902756514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document cc +0.00046519252660631277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document cc +0.0002938876402514769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document cc +0.00028676090994509047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document cc +0.00027296150117506716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document cc +0.00026513502621960483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document cc +0.0002680081327926125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document cc +0.00025831225828720344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document cc +0.00026647037295561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document cc +0.0002525733734572654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document cc +0.00025831708887575375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document cc +0.00042487627444443476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document cc +0.0004951213245023891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document cc +0.0004804051413177752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document cc +0.0004662397611340532 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document cc +0.0004550138655253933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document cc +0.00044494909122746795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document cc +0.0002899112253051385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document cc +0.0004372879736279761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document cc +0.0004529568099252922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document cc +0.00045127826158829573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document cc +0.0004436558176737439 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document cc +0.0004419233237678378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document cc +0.000434589215880319 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document cc +0.00029153613207706566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document cc +0.0004312458058738854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document cc +0.00028741854968757313 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document cc +0.00046853200754421234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document cc +0.0004949145252030074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document cc +0.00044459683920483167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document cc +0.0003836095306696336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document cc +0.0003789760237872398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document cc +0.0003749227438304427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document cc +0.0003628558277173369 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document cc +0.00039468301394041474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document cc +0.00038874701821614864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document cc +0.0004158492456077867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document cc +0.00042360504554060077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document cc +0.00040386729844317623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document cc +0.00027595096702902474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document cc +0.00043638766787829135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document cc +0.0002218691596850179 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document cc +0.0004437566108089954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document cc +0.0003889996411609667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document cc +0.00043454421906537704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document cc +0.0004522564392830988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document cc +0.00041517835659357416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document cc +0.0002614360863446896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document cc +0.00037543522111463596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document cc +0.0004386190133514781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document cc +0.00046358333286115075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document cc +0.00043186261317942404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document cc +0.0002377581602097957 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document cc +0.00025973334085074254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document cc +0.00040139099332000796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document cc +0.00043674860686687174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document cc +0.00040853289309329373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document cc +0.000242910191729688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document cc +0.0004431071731750582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document cc +0.0004388092670482523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document cc +0.000381418866255965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document cc +0.0004100117296419717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document cc +0.00042469230366022745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document cc +0.00041744151905374254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document cc +0.00022835699906752945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document cc +0.0004380161085387397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document cc +0.00044803212381807456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document cc +0.00040554932796137236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document cc +0.0004234508646347761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document cc +0.00043341209652360653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document cc +0.00023966604734537185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document cc +0.000259165907316014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document cc +0.0004270653021833602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document cc +0.0004341547032162028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document cc +0.0004111478117275994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document cc +0.0004299383567984396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document cc +0.0004241899124590779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document cc +0.0004502719349364145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document cc +0.00038994621469645615 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document cc +0.0003859912398894952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document cc +0.0004247535950310557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document cc +0.000386982084327716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document cc +0.0004196451040053251 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document cc +0.0004096278509782259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document cc +0.0004373334932695721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document cc +0.0004180889975240641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document cc +0.00042079636929672745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document cc +0.00038063574611812913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document cc +0.0003817505891515542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document cc +0.0004420096268860222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document cc +0.00039182670726410623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document cc +0.0003635667850372299 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document cc +0.00041564996472055667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document cc +0.000400529358757286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document cc +0.0003939113874958451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document cc +0.00039066622068940996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document cc +0.0004290098538807143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document cc +0.0004240739958197099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document cc +0.00040775392659215333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document cc +0.0004091634200396925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document cc +0.00042299190476617914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document cc +0.0003701492680344151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document cc +0.0003807353844384635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document cc +0.00038813507771983156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document cc +0.00040072346558408346 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document cc +0.0003603595180423597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document cc +0.00038799421353112465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document cc +0.00037575235582264926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document cc +0.0004239190342959713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document cc +0.0004606044799136546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document cc +0.00045107950652529253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document cc +0.0004391947201871058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document cc +0.0004457516661123035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document cc +0.0004301297170991686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document cc +0.00044661704164586694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document cc +0.0004438849846114837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document cc +0.0004444205734316823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document cc +0.0004190924165303394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document cc +0.00043942581131677875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document cc +0.00021568459798090663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document cc +0.0003814929225407199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document cc +0.0003217453179359235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document cc +0.00031719591470267974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document cc +0.00032434115726922137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document cc +0.0004079911120371051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document cc +0.000329492766381148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document cc +0.0003845916162001633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document cc +0.0003835208964390098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document cc +0.00037847334157173194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document cc +0.00038296039903791865 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document cc +0.00037896336828472 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document cc +0.00037620974396391355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document cc +0.00037420590727111843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document cc +0.000340490625886403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document cc +0.0003078314411035827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document cc +0.00034153990750656097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document cc +0.0003308858103982067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document cc +0.0003452640607156025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document cc +0.00033095276418403455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document cc +0.0003116308995860414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document cc +0.00032446713226408477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document cc +0.0003015816821912984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document cc +0.00031612418775706894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document cc +0.0003278516344971041 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document cc +0.00033079446736097217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document cc +0.00032278977146550837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document cc +0.00032065272988207914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document cc +0.0003936696452406576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document cc +0.0003450109536627789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document cc +0.0003339787189919641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document cc +0.0003284303856176974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document cc +0.00033652677276843477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document cc +0.0003257822443845694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document cc +0.0003293985569149334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document cc +0.0003310360260148262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document cc +0.0003233770986418526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document cc +0.0003172280092149422 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document cc +0.0003160674744292835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document cc +0.00030931090289598506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document cc +0.0003093173886443107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document cc +0.00033167847081104083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document cc +0.00031131501311729723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document cc +0.00031046608876279845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document cc +0.00030569235942207244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document cc +0.00030777943671285197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document cc +0.00029303314290956683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document cc +0.0003045824546400205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document cc +0.00030360880677729793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document cc +0.00031646239964835433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document cc +0.0003129122300603785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document cc +0.00031060464956661433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document cc +0.000311819032500067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document cc +0.0002977872483902282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document cc +0.0003009448600922438 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document cc +0.00028610292098537774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document cc +0.0002988326876216654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document cc +0.00028550828372819075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document cc +0.0002830381750875739 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document cc +0.0002848495855927156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document cc +0.0002856443760308144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document cc +0.00027442895344188584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document cc +0.0002681160554049462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document cc +0.0003421482544126989 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document cc +0.0004005872948449718 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document cc +0.0003930123959320308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document cc +0.0003867271832275778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document cc +0.000380805140455254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document cc +0.0003814769861947819 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document cc +0.00038025170883282324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document cc +0.0003738026647867475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document cc +0.00018960856915036276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document cc +0.0003697177501953134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document cc +0.00036674194328136693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document cc +0.00036447406838697555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document cc +0.00036686410861101255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document cc +0.00035915267825103423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document cc +0.0003624758404026675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document cc +0.0002822812140180794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document cc +0.00030620512946920813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document cc +0.000294249776520589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document cc +0.00030238536967523434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document cc +0.00029509593361580754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document cc +0.0002906912701830899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document cc +0.0002921944165474959 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document cc +0.00028358919691127954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document cc +0.0002813182772323272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document cc +0.00027442640800299205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document cc +0.0002747820342933984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document cc +0.0002747584403979717 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document cc +0.00027499129634862444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document cc +0.0002712050404257197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document cc +0.0002616256943143254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document cc +0.00026769938929002815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document cc +0.00038396081322727017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document cc +0.0003863140490027991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document cc +0.00037702277513203237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document cc +0.0003633274156107032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document cc +0.0003587473889240435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document cc +0.0003507672084278415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document cc +0.00033776425499780385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document cc +0.0003377914127574796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document cc +0.00032948015659161326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document cc +0.00033245638541392985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document cc +0.00031080707640648695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document cc +0.0002976903331149755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document cc +0.0002965121463725523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document cc +0.0002933849695266647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document cc +0.0002837035078508233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document cc +0.00028684569079589323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document cc +0.0003145192320802359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document cc +0.0003566937253273515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document cc +0.0003470199109592918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document cc +0.0003060245312041868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document cc +0.0002650817213818789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document cc +0.0002643604938780134 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document cc +0.000299350876031416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document cc +0.0003178540797697938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document cc +0.000271850367887767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document cc +0.00031349896596549 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document cc +0.00031749734412765755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document cc +0.0003791137842391209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document cc +0.0003742334169957992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document cc +0.0003705639757351107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document cc +0.0003126986769797042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document cc +0.00031038132814561196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document cc +0.00036464437173804883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document cc +0.0003569480488951322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document cc +0.0003541239221619106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document cc +0.00035315297411308053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document cc +0.0003572451925404141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document cc +0.0003514986129411253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document cc +0.0003521798298425866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document cc +0.00034553677439244716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document cc +0.000349004719809412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document cc +0.0003468247484872769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document cc +0.0003465822608356558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document cc +0.00035410983132162007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document cc +0.0003487908354969444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document cc +0.0003479024763238147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document cc +0.000341412530646823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document cc +0.00034451316273667034 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document cc +0.0002618849993484869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document cc +0.00026788679978901144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document cc +0.00027450670773227214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document cc +0.0002661273129899329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document cc +0.00026836569676402957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document cc +0.00026155876975483236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document cc +0.0002609276830117151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document cc +0.0002644161630512771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document cc +0.00036789208972872557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document cc +0.00037829849439990513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document cc +0.0003788894943523098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document cc +0.0003617207777959397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document cc +0.0002541334487248998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document cc +0.0002707945538071073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document cc +0.00027046282716455214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document cc +0.0002652443167243215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document cc +0.0002685859923850986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document cc +0.00025734961751176414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document cc +0.000259041720872915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document cc +0.00025340107274823446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document cc +0.00025757135121837893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document cc +0.00025617700500574084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document cc +0.0002566931670562857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document cc +0.0002543871190716101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document cc +0.00024997565589481713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document cc +0.0002954079779456287 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document cc +0.00034890741135252835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document cc +0.0003473298137731525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document cc +0.0003296959618486435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document cc +0.0003304520061604598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document cc +0.00032377956175729824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document cc +0.00031700696295168713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document cc +0.0003060382346081943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document cc +0.0003012003005056863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document cc +0.0002981074073993884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document cc +0.0002922128825950705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document cc +0.000348901087722931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document cc +0.0003408286289467841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document cc +0.0003410649680770183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document cc +0.0003358524215576502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document cc +0.0003343661874989231 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document cc +0.00032810573699389156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document cc +0.00032261449539097497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document cc +0.0003162694866049203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document cc +0.0003158381156468853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document cc +0.000317376061083603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document cc +0.0003125788639953052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document cc +0.0003010105041885602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document cc +0.0003065865059090678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document cc +0.0003084275726508053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document cc +0.00030966560718296085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document cc +0.0002957728057853081 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document cc +0.00029904164542325336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document cc +0.0002955358888729187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document cc +0.00028692976446931544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document cc +0.0002923476214935797 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document cc +0.0002893691697212419 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document cc +0.0002855895211981585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document cc +0.00027968347097626246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document cc +0.0002810783462604979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document cc +0.00027794080455729715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document cc +0.00034784376461416953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document cc +0.0003488347959010943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document cc +0.00034790583710250724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document cc +0.000345913166618151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document cc +0.00033801936268066675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document cc +0.0003290591130212315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document cc +0.00034051399521366823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document cc +0.00032470943131841784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document cc +0.00031679540050914276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document cc +0.00031814596342422325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document cc +0.0003156466289485036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document cc +0.00029985010879003633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document cc +0.0002905176377776361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document cc +0.0004206836775460856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document cc +0.00020660449162246918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document cc +0.0003461727254468087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document cc +0.00020592870907067763 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document cc +0.00034173505299233005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document cc +0.0004052437256652738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document cc +0.0004080650901351697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document cc +0.00039778184149144276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document cc +0.00039046311464950275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document cc +0.00039043444911071384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document cc +0.000388575704932843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document cc +0.00019737533145666597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document cc +0.00037610755595812403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document cc +0.00037315400127598317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document cc +0.00037415028580922163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document cc +0.00036694041707212337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document cc +0.00018947219857306515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document cc +0.00037046050826533545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document cc +0.0003587440768559087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document cc +0.00034623936498708903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document cc +0.0003502289592617922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document cc +0.00034692398063649823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document cc +0.000339340809421849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document cc +0.0003360510394816983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document cc +0.0003354673850814145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document cc +0.00032937682875877047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document cc +0.00032844505049317715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document cc +0.00028287199339908627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document cc +0.0002795217197003578 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document cc +0.00028048955601883463 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document cc +0.0002769326396439027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document cc +0.0002727090021299243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document cc +0.0002726577841024554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document cc +0.00026663619593455374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document cc +0.00026068042672138127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document cc +0.0002637704114326801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document cc +0.0002593043567100412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document cc +0.0002599897110113453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document cc +0.0002435078682758859 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document cc +0.0002450530071379054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document cc +0.00024233331983743606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document cc +0.0002934750947999535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document cc +0.00033241226364044474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document cc +0.00032938406090272075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document cc +0.00032778705403953246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document cc +0.00032184551480398754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document cc +0.00031874002264945737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document cc +0.0003165319685666433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document cc +0.00031307071173376295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document cc +0.00031119524184911957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document cc +0.0003102253344576429 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document cc +0.0003088976240383192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document cc +0.0002951410823077708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document cc +0.00029772657676757413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document cc +0.0003056048989909935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document cc +0.00031991305381648026 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document cc +0.00030890256978362426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document cc +0.0003109382904091933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document cc +0.00031035798529690644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document cc +0.00030741666395911753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document cc +0.0002989918594861846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document cc +0.00029569635443989434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document cc +0.0002973992445667285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document cc +0.000293397351001072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document cc +0.00028737817438047954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document cc +0.00028252738144009747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document cc +0.0002805511898623541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document cc +0.0003718020784620472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document cc +0.0003499713845765235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document cc +0.00034283547445326676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document cc +0.00031464759888838765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document cc +0.00033188946446414833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document cc +0.000326084432195463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document cc +0.0003764568303917893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document cc +0.0003604955598858414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document cc +0.0003655654554133222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document cc +0.00035762304033750504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document cc +0.00038478883950347103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document cc +0.00027735714341247454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document cc +0.00028139534607773563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document cc +0.00019777292251713763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document cc +0.000285571704874486 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document cc +0.00028543482146244363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document cc +0.00019434234484256758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document cc +0.00027854908176986763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document cc +0.0002847068039566143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document cc +0.00028672356943064853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document cc +0.00027782687605808177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document cc +0.0002843539634105203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document cc +0.0002894748379090401 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document cc +0.0002868852440186493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document cc +0.0002818504885373851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document cc +0.00028680112812941034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document cc +0.00019258978168723977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document cc +0.00028760637934715155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document cc +0.0002820439443912918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document cc +0.0002831001054410018 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document cc +0.00029001901552467397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document cc +0.00027779449377883156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document cc +0.00019949837437516796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document cc +0.0002907306472984446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document cc +0.00027814858381318327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document cc +0.00019472790889161432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document cc +0.00020472626596924125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document cc +0.0002870045081974301 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document cc +0.00019812241927078482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document cc +0.0002817553333369554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document cc +0.00027829782796642117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document cc +0.00028289431732284113 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document cc +0.0002795526296717729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document cc +0.00027682829988044574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document cc +0.0002895432402719184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document cc +0.0002823174903941811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document cc +0.00028170972351837796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document cc +0.00027807915877838826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document cc +0.00028588515681452956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document cc +0.00028112324090816726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document cc +0.00020636178289985485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document cc +0.00019447255290980535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document cc +0.0002850824220591452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document cc +0.00027856429520116784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document cc +0.0002820880676635633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document cc +0.00028943902215995714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document cc +0.0002676366291085329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document cc +0.00023806333809954687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document cc +0.00024526460430233455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document cc +0.00023876876664622726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document cc +0.00023379770334179805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document cc +0.00024175151269138382 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document cc +0.00023386583242595706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document cc +0.00023771797150160827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document cc +0.0002262748967483896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document cc +0.0002408148346432682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document cc +0.00023398651720444235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document cc +0.00022989433874474592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document cc 
+0.00023948500543957772 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document cc +0.0002331594076859196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document cc +0.00023375132439600242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document cc +0.00023923410909668642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document cc +0.00023952796315562954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document cc +0.0002327466076905069 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document cc +0.00023082758956797212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document cc +0.0002240509275524448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document cc +0.00022798879995765268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document cc +0.000221172516774386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document cc +0.00021767045123534623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document cc +0.00021982832794804484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document cc +0.00021971626543789102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document cc +0.00022566565206920132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document cc +0.0002181984894194856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document cc +0.00021831417549554653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document cc +0.00021601405421187145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document cc +0.00022275733725519607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document cc +0.00021847734911973986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document cc +0.0002243591012664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document cc +0.00021688758139483833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document cc +0.0002182953624789215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document cc +0.00020475155724026002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document cc +0.00021498078062960065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document cc +0.0002157914337233064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document cc +0.00021781838494967963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document cc +0.00021723242266814558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document cc 
+0.0002176782686553837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document cc +0.0003486179404943968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document cc +0.00034882846352857634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document cc +0.00031400868448352596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document cc +0.00030273484020011963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document cc +0.00029895889118145404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document cc +0.00029770764609621714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document cc +0.0002990181332116852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document cc +0.00029653733972285996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document cc +0.00029624649222942476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document cc +0.00029625609720203576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document cc +0.00029731928930852147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document cc +0.00029011721326148513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document cc +0.00028849788197494655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document cc +0.00021601278623858145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document cc +0.00021319599281739178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document cc +0.0002153325290600083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document cc +0.00018566946174516558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document cc +0.00020736824394291617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document cc +0.00020857419820128004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document cc +0.00020058526129536423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document cc +0.00020745812166665217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document cc +0.00020652171015271702 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document cc +0.00020643808911278608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document cc +0.00020040513914482103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document cc +0.00020598050188272898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document cc +0.0001969184139343296 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document cc +0.0001972748812937012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document cc +0.0002038556751586195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document cc +0.00020245186011313464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document cc +0.00019950381422038783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document cc +0.00020837055459665258 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document cc +0.00020371856218246096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document cc +0.00019537612301625791 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document cc +0.00019914984508813857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document cc +0.0002053787713691309 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document cc +0.00019082100541008637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document cc +0.00020397153334531813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document cc +0.0002021462693077317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document cc +0.00019609357008124035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document cc +0.00019693256622486236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document cc +0.00020007239732428112 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document cc +0.00020467075741591954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document cc +0.00019584883400022932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document cc +0.00019135050391176972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document cc +0.0003362829834208298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document cc +0.00034013691154784095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document cc +0.00033215887031941976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document cc +0.00032681189065396707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document cc +0.0003149138485493094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document cc +0.00030179177307540077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document cc +0.0002923278437581119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document cc +0.00029470052278994486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document cc 
+0.0002994095093045731 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document cc +0.00029033525096085037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document cc +0.00029390798852496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document cc +0.0002916230924130842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document cc +0.00029419886374594913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document cc +0.0002865469756730764 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document cc +0.00021191292549942086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document cc +0.00021369664817409847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document cc +0.00021612485624266726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document cc +0.00022242192634588478 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document cc +0.00014605095659989698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document cc +0.00022070626106341693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document cc +0.0002174420774054071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document cc +0.00021325858963116995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document cc +0.0002124322999488052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document cc +0.0002081218896969054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document cc +0.0002108710211556957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document cc +0.00020686867095978426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document cc +0.00020895752681041895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document cc +0.00020741922266415738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document cc +0.0002069112657197308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document cc +0.00020644627473468118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document cc +0.00020332991338121604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document cc +0.0003560895677789848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document cc +0.00032915779111908214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document cc +0.00033810613317040864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document cc +0.00033729626594036923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document cc 
+0.00033550342864602944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document cc +0.00034173474024556906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document cc +0.000331505340748827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document cc +0.0003270050330117195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document cc +0.00032585275329172556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document cc +0.0003143383203190604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document cc +0.00031655199110388894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document cc +0.00030738872158476413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document cc +0.00030838388352699285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document cc +0.0003053596995351888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document cc +0.00031836304739584593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document cc +0.000315315435873905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document cc +0.0003087116248965243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document cc +0.00030396790625537645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document cc +0.0003335812246032149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document cc +0.00034570956323095843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document cc +0.00034563035636675786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document cc +0.00033411265479076335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document cc +0.00034439191141692787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document cc +0.0003364483125496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document cc +0.0003299500453608033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document cc +0.00033163377700074837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document cc +0.00032638649660627673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document cc +0.00032616167939645234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document cc +0.0003205289298760723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document cc +0.00031939393740815355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document cc +0.00031593164066731296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document cc 
+0.00031928871111254405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document cc +0.00029670189073175004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document cc +0.00020517703846735904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document cc +0.00020128418186172073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document cc +0.00019662723895606717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document cc +0.0001981157042081407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document cc +0.00019703489037041608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document cc +0.00019079796331785068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document cc +0.0001909352306690079 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document cc +0.00018824662295261396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document cc +0.00019864275319325954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document cc +0.00018818516521649587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document cc +0.00018875694972812844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document cc +0.00018231621170645482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document cc +0.00018349407845798273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document cc +0.00018088971427746906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document cc +0.00018296284236327237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document cc +0.0001876011825819916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document cc +0.000329052068725176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document cc +0.00032223616273648536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document cc +0.00031272564089633955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document cc +0.00031621609908414494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document cc +0.0003117213560911235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document cc +0.00030218064069945934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document cc +0.00030658916600512085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document cc +0.0002915863534115821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document cc +0.0002940280138374372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document 
cc +0.00029067860468866085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document cc +0.00028529228063135635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document cc +0.00028336893301452256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document cc +0.0002794668089130099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document cc +0.00021681361378827842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document cc +0.0001484664674497246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document cc +0.00021950558378215133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document cc +0.00021806860758808645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document cc +0.00021819568718852282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document cc +0.00021626925931585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document cc +0.0001464536143077762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document cc +0.00021432777088808917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document cc +0.000213473805865147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document cc +0.00021397067253964538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document cc +0.00020758957647437263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document cc +0.00020687124337683314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document cc +0.00020630057046511005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document cc +0.0002091166859352538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document cc +0.00020777355025615267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document cc +0.00020709287641496176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document cc +0.00020736464660577094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document cc +0.00020062246741862607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document cc +0.00020693207561942915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document cc +0.00021151004871893024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document cc +0.00019930249098689716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document cc +0.00021589710041231824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document cc +0.00021369204789905741 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document cc +0.0002147099923936778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document cc +0.00021077531190389536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document cc +0.0002100509829113836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document cc +0.00021185362601571124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document cc +0.00020722136637339565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document cc +0.00020300093701169531 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document cc +0.00019859737993313477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document cc +0.00019971314372100164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document cc +0.00019549908270269278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document cc +0.00019649820843534028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document cc +0.00019619415513498067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document cc +0.00019493006120377898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document cc +0.00019499409035775506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document cc +0.00019252988593634277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document cc +0.00019440768268686405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document cc +0.00018747161324755577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document cc +0.0001879575932372779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document cc +0.00019040707058357506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document cc +0.0001871931095090703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document cc +0.00020112966223017096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document cc +0.00020516878165311017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document cc +0.00020664735191740533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document cc +0.00021041398572882962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document cc +0.00020397992929690396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document cc +0.0002039978580295561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document cc +0.00020592785601142126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document cc 
+0.0001990755527445265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document cc +0.00019729564847798732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document cc +0.00019958182230527032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document cc +0.0001985037302636386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document cc +0.00020204130355115716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document cc +0.0002000296401958085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document cc +0.0001983064832295463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document cc +0.00019663108484195617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document cc +0.00019510678560556523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document cc +0.0001873284057063206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document cc +0.00019311553072495885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document cc +0.00034652137288816547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document cc +0.0002813690318850024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document cc +0.00027697649713138685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document cc +0.0002755419092534421 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document cc +0.0002681583054440219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document cc +0.00026945753192750824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document cc +0.00026169470768245737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document cc +0.00026437008960810825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document cc +0.0002637294838228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document cc +0.00026491867965088836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document cc +0.00025504483625138986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document cc +0.0002545040623796586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document cc +0.0002546682814073622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document cc +0.00025545439487142615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document cc +0.0002626896557978271 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document cc +0.00025092040940402784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document cc 
+0.0002589154885863872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document cc +0.00024106160482721467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document cc +0.0002483289690087987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document cc +0.0002388930282784437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document cc +0.00024006340759273874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document cc +0.00023765248178029045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document cc +0.00023061351965578936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document cc +0.00024954224883546477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document cc +0.00017861017233018525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document cc +0.00017810832743667658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document cc +0.00017599709170759497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document cc +0.00017462723516505223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document cc +0.0002906316527068669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document cc +0.00033762141066247166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document cc +0.00017170670574152494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document cc +0.00017258674515137717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document cc +0.0002815386173173926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document cc +0.0002996845935618989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document cc +0.0002735268488987296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document cc +0.0002971738713071517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document cc +0.0002942690674002763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document cc +0.0003322222207729567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document cc +0.0003378721656198464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document cc +0.00018307262621851067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document cc +0.00033956081502775057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document cc +0.00031604820927876276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document cc +0.00028805657681088917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document cc 
+0.00026312293321215633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document cc +0.00034366936722921455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document cc +0.0002865256504406559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document cc +0.0003063615195861786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document cc +0.00028412791619666136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document cc +0.00028060835132727154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document cc +0.00032544974761560506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document cc +0.0002647177833217225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document cc +0.0003152621884896575 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document cc +0.0003054625140336913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document cc +0.00031183308312292263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document cc +0.00018175026696621178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document cc +0.00017699918328872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document cc +0.00018222339261441908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document cc +0.00018348005930964137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document cc +0.0001810735993810541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document cc +0.00030846441282038914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document cc +0.0002972326889310354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document cc +0.00017433421318235594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document cc +0.00032799458649525895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document cc +0.00032482130048512673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document cc +0.00031943465668672475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document cc +0.00029615593630484517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document cc +0.0002893126939511001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document cc +0.0002849288351723284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document cc +0.00028383906633569267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document cc +0.00028072526091262615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document cc 
+0.000284239564292377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document cc +0.0002778903109432523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document cc +0.0002771644389501471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document cc +0.0002733316182319337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document cc +0.00026362539185869363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document cc +0.0002636325383220217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document cc +0.00026740622442302886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document cc +0.0002646771971853427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document cc +0.0002628566720605389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document cc +0.0002644760695434766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document cc +0.0002623837702310999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document cc +0.00026088722976772894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document cc +0.0002567065374799158 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document cc +0.00018857382101207726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document cc +0.00019036580399817203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document cc +0.00018348828065261222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document cc +0.00018491851780345073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document cc +0.00018904887260080187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document cc +0.0001875609304251801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document cc +0.00018393034720015817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document cc +0.00018419795526114903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document cc +0.00018699955623404795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document cc +0.00018276256902965128 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document cc +0.00017698045695190812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document cc +0.00018104650132303642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document cc +0.00017758206731279688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document cc +0.00017131402995103497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document cc 
+0.000175944428350446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document cc +0.0003416745727147391 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document cc +0.0003163259373952889 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document cc +0.0002804489269172448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document cc +0.00028748272397403175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document cc +0.00027603318345630605 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document cc +0.000271638824679648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document cc +0.0002763761210210942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document cc +0.00026501984873172717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document cc +0.00026422486894694714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document cc +0.0002686339100849262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document cc +0.0002610837453940606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document cc +0.000260974343729353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document cc +0.0002599403837029134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document cc +0.0002937273113238609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document cc +0.0003341790732600504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document cc +0.0002620661576600244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document cc +0.0003027929169239288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document cc +0.00031944039129326894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document cc +0.00019025676304139009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document cc +0.00018680910145009907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document cc +0.00034215840419416437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document cc +0.00018618120812119364 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document cc +0.00018605853095599425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document cc +0.00018120712626096538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document cc +0.00018315079292495327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document cc +0.00018362556449041974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document cc 
+0.0001780024456718171 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document cc +0.00033296526436178697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document cc +0.0001802398632282846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document cc +0.00017340263100798256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document cc +0.00017755840547238697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document cc +0.00018419413735260606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document cc +0.00017869518174591322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document cc +0.00017526271460129484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document cc +0.00017852168597981907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document cc +0.00017566536156787157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document cc +0.00017589867964432936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document cc +0.00017831487394075305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document cc +0.00017837310528935862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document cc +0.00018200908814216548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document cc +0.0001795136627511612 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document cc +0.0003414021775300033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document cc +0.00017177291787788502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document cc +0.0003441900648571877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document cc +0.0003394534597060673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document cc +0.0003236887233114832 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document cc +0.0001639544129688747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document cc +0.00019137443753211255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document cc +0.00018575146284680153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document cc +0.00019184792863440243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document cc +0.00018966043065679055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document cc +0.00017968851317035848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document cc +0.00018479881897661546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document 
cc +0.0001813642692683015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document cc +0.0001686449798983066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document cc +0.00018516104592230446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document cc +0.00031283726601066385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document cc +0.0003248607542883853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document cc +0.00031583241601202365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document cc +0.00031238270857730376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document cc +0.000307150592403979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document cc +0.00029443829986847044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document cc +0.0002942723732234677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document cc +0.00023514930666443422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document cc +0.0020776328951453444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document cc +0.0021768234410538883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document cc +0.002106973549276289 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document cc +0.002110915756171751 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document cc +0.0017032382109816464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document cc +0.0019047944877712286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document cc +0.0019402711744016077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document cc +0.0006264790011223686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document cc +0.0017885401938106643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document cc +0.0003547982093445404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document falcon +0.00035934014428504944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document falcon +0.00035707704501371544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document falcon +0.00035287930712815354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document falcon +0.00035977166728996823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document falcon +0.0003581675664109838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document falcon +0.0003548617059697185 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document falcon +0.0003639582000286208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document falcon +0.00035375839698688127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document falcon +0.0003743722020080678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document falcon +0.0003530399715341242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document falcon +0.00035511875882752406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document falcon +0.0003618733574783154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document falcon +0.00035185243285420104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document falcon +0.0003541503739732106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document falcon +0.0003631679485751914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document falcon +0.00035748045578182274 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document falcon +0.0003606490690555877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document falcon +0.0003626383296610091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document falcon +0.00035442644361264756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document falcon +0.00035978370170539796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document falcon +0.0003585562375341541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document falcon +0.0003601958372888019 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document falcon +0.000350277765402227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document falcon +0.0003616521184211704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document falcon +0.0003620625543608188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document falcon +0.0003560781983850704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document falcon +0.0003553209610592676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document falcon +0.00035905348643915075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document falcon +0.00034744258805696526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document falcon +0.00035462784035661496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document falcon +0.00034768186175100895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document falcon +0.0003568534635532736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document falcon +0.00035586511544371234 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document falcon +0.0003524567827568137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document falcon +0.0003512453770426313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document falcon +0.0003591792726468799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document falcon +0.0003514024529343127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document falcon +0.0003584880112586934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document falcon +0.00035133552916418045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document falcon +0.0003600811981350215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document falcon +0.0003571663974228119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document falcon +0.00035768103378874214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document falcon +0.00035939205561113694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document falcon +0.00035186773916029825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document falcon +0.0003542829672490847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document falcon +0.0003592783642898726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document falcon +0.0003556367340099302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document falcon +0.00035391392271377027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document falcon +0.00035486725707484836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document falcon +0.00034866743396828035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document falcon +0.0003517219808644735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document falcon +0.00034874458549673823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document falcon +0.000355773136961014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document falcon +0.00035611750387841917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document falcon +0.00035305602013916315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document falcon +0.0003578207127071924 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document falcon +0.00035514635841943707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document falcon +0.00034816946212866206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document falcon +0.0003512707269761496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document falcon +0.0003483392117980654 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document falcon +0.0003572169607204321 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document falcon +0.00035139153281660794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document falcon +0.00035536422129036537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document falcon +0.000352017164107143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document falcon +0.000351889550179365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document falcon +0.000358759689953589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document falcon +0.0003569286079869268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document falcon +0.0003657752958602099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document falcon +0.00035396127934790697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document falcon +0.0003618565071224743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document falcon +0.00035146051531973204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document falcon +0.00036107135765783567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document falcon +0.00035019554279994576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document falcon +0.00035567858879904983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document falcon +0.0003504753174793183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document falcon +0.00035931140831329194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document falcon +0.0003502967866002823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document falcon +0.0003532911801041972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document falcon +0.0003583543013070199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document falcon +0.0003566243489931224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document falcon +0.0003468752314799221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document falcon +0.0003597840618138091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document falcon +0.00035128822484768084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document falcon +0.00035889496943437507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document falcon +0.000352400524650424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document falcon +0.0003518689536768735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document falcon +0.00035866864741303467 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document falcon +0.0003454687659106334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document falcon +0.00035348007259317576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document falcon +0.0003539752270940644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document falcon +0.00035146495994081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document falcon +0.00035397212846310423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document falcon +0.00035208246467162587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document falcon +0.0003490843168676626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document falcon +0.00035299633658644394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document falcon +0.00034868327466167065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document falcon +0.00035941351365601583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document falcon +0.0003545343062735255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document falcon +0.0003528956380445978 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document falcon +0.0003553355770443352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document falcon +0.0003644224004937743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document falcon +0.00035234291036216907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document falcon +0.0003596237469847771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document falcon +0.0003531996065735989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document falcon +0.0003547177054106099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document falcon +0.0003575586499260483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document falcon +0.00035262635135283667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document falcon +0.0003624191962188944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document falcon +0.0003488398052948616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document falcon +0.0003598294093147917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document falcon +0.00035583006534466323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document falcon +0.00035403139653225103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document falcon +0.00036134702642187156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document falcon +0.0003573689927162834 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document falcon +0.0003577141131435527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document falcon +0.00035208814419277406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document falcon +0.00035996720683665625 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document falcon +0.00035415304658912596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document falcon +0.00036353353029443546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document falcon +0.0003537326003150983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document falcon +0.00036053976358299083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document falcon +0.000352380489373494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document falcon +0.00036154661616900994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document falcon +0.00035959332325963614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document falcon +0.0003597954667189692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document falcon +0.0003563108270597542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document falcon +0.0003582891940460143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document falcon +0.0003497728210484297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document falcon +0.0003549834902179354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document falcon +0.0003529828233484542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document falcon +0.00034627483903285777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document falcon +0.00035569006572589215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document falcon +0.00035449377946910314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document falcon +0.00035802844396194623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document falcon +0.0003617277809353208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document falcon +0.00035034118898654814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document falcon +0.000351091193908611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document falcon +0.0003527914342210668 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document falcon +0.00035028288369781376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document falcon +0.00035775745592780506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document falcon +0.0003449630690661468 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document falcon +0.0003583490698830361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document falcon +0.0003476995746684122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document falcon +0.0003535632505019212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document falcon +0.00035640180641147417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document falcon +0.000361731045691765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document falcon +0.0003534082129597368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document falcon +0.0003550344149828664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document falcon +0.00035363002411364057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document falcon +0.0003537265579677396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document falcon +0.00034950531383577937 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document falcon +0.00035008511827347514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document falcon +0.00035594533400871325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document falcon +0.00035266312861335946 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document falcon +0.00035280268794863923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document falcon +0.0003565470391528536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document falcon +0.0003588492322689137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document falcon +0.00035469909697832775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document falcon +0.00034712082813410526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document falcon +0.000348701157101807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document falcon +0.0003500192014479944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document falcon +0.00035120560544669755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document falcon +0.00035403656850437445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document falcon +0.00035852376560749366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document falcon +0.0003534754068111774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document falcon +0.00035591740046720765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document falcon +0.000348522354782563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document falcon +0.0003533533959664415 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document falcon +0.00035631425964030697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document falcon +0.0003485886551574741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document falcon +0.00035917652631065777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document falcon +0.0003482975272111288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document falcon +0.00035580661277480167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document falcon +0.0003492290722955348 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document falcon +0.00034989284450240613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document falcon +0.0003545677216162781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document falcon +0.00034622286859463484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document falcon +0.00036070626989861965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document falcon +0.00035518365036320786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document falcon +0.00035272907057848406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document falcon +0.0003547343638218734 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document falcon +0.0003496450144966242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document falcon +0.0003537407829294287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document falcon +0.0003489722653985685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document falcon +0.00035057186899911295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document falcon +0.0003507566548933051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document falcon +0.00035630360179023747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document falcon +0.00035631362503416367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document falcon +0.0003490204248026821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document falcon +0.00035761724058371226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document falcon +0.00035037664777467137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document falcon +0.000353402110481068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document falcon +0.00034524163568371745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document falcon +0.00035528523728570974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document falcon +0.00034784916132431703 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document falcon +0.00034928476408048925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document falcon +0.00034989205973784984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document falcon +0.00034201664404094254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document falcon +0.0003529676016338611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document falcon +0.00034643433682346637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document falcon +0.0003511666373001904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document falcon +0.00034828669066575333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document falcon +0.0003494625207264413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document falcon +0.0003458957535879216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document falcon +0.0003543020478990003 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document falcon +0.00034754384069014956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document falcon +0.0003598856392240133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document falcon +0.0003503335458553846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document falcon +0.00035919595619778716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document falcon +0.00035767737970754404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document falcon +0.00035197152783998165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document falcon +0.0003549609834422404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document falcon +0.0003568184100569753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document falcon +0.0003512652818651935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document falcon +0.00035912648958665754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document falcon +0.00034764526964056546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document falcon +0.000352439784960359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document falcon +0.00035295886560764226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document falcon +0.0003518132693658672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document falcon +0.00035589987915465713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document falcon +0.00034923863317385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document falcon +0.0003457987267929692 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document falcon +0.0003560928663480501 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document falcon +0.0003529603811204932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document falcon +0.0003524438555443043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document falcon +0.0003438847030263783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document falcon +0.00035981978898461613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document falcon +0.0003446342778566972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document falcon +0.00035529584995236537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document falcon +0.00034855740895831116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document falcon +0.00034932634912802544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document falcon +0.00035805518303064666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document falcon +0.0003497941877073061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document falcon +0.00035774398685405447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document falcon +0.0003560421780316607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document falcon +0.0003508844468369392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document falcon +0.00035731928892270107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document falcon +0.0003557884626314314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document falcon +0.00034992996760289355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document falcon +0.000360752554360921 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document falcon +0.0003452321668708545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document falcon +0.0003591745226131023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document falcon +0.00035256981433229084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document falcon +0.00035378123159712034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document falcon +0.000350464354895999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document falcon +0.00035074625557389677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document falcon +0.00035025894701994667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document falcon +0.00035437902514857614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document falcon +0.0003514684519732232 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document falcon +0.00035449717909633905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document falcon +0.0003436816402714221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document falcon +0.00035139158071782116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document falcon +0.0003509424079843335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document falcon +0.000343894618577506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document falcon +0.0003500789770661659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document falcon +0.0003407788080680086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document falcon +0.0003581908175239701 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document falcon +0.0003465541618780918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document falcon +0.00034600228792437736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document falcon +0.00034416738982773204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document falcon +0.0003519900340150641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document falcon +0.000343369616864659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document falcon +0.0003544993883274688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document falcon +0.0003504441365073392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document falcon +0.00034859160702727056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document falcon +0.00035355909532647185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document falcon +0.0003471900922691849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document falcon +0.0003563015508709187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document falcon +0.0003487888744148821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document falcon +0.00034711767548688336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document falcon +0.0003530734609369085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document falcon +0.00035123969242560935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document falcon +0.0003517127620891489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document falcon +0.00035232835416868673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document falcon +0.0003524437481912308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document falcon +0.0003525996167005602 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document falcon +0.00035064770545242043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document falcon +0.00035311558274981226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document falcon +0.00034952204800569914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document falcon +0.0003541471367344846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document falcon +0.00035418812454561825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document falcon +0.0003528951372900714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document falcon +0.0003542338042975688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document falcon +0.00034937738939942796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document falcon +0.0003522182190878447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document falcon +0.0003501406466507449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document falcon +0.00034973079877492633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document falcon +0.0003485274567713538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document falcon +0.00034999308679368985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document falcon +0.0003570051724707296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document falcon +0.00034567230462019706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document falcon +0.00035529000940160696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document falcon +0.00034956512308671755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document falcon +0.0003496962834028953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document falcon +0.0003468745282493457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document falcon +0.0003502717155809202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document falcon +0.0003556240880896514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document falcon +0.0003515109488424343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document falcon +0.0003563156688192592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document falcon +0.00035040277363989817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document falcon +0.0003481408593290717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document falcon +0.0003624575124332874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document falcon +0.0003522684124250313 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document falcon +0.00035286996027653544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document falcon +0.00034967623997256725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document falcon +0.00035182649587602765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document falcon +0.0003524892557026489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document falcon +0.0003507642477451811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document falcon +0.00036190408389835666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document falcon +0.00035102739424880766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document falcon +0.00035239718753257265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document falcon +0.00035298076121821316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document falcon +0.0003478704389752654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document falcon +0.0003503109191567942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document falcon +0.00035143250975654426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document falcon +0.0003480663923069012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document falcon +0.00035691540219998623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document falcon +0.000348815437166351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document falcon +0.00035202073257766225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document falcon +0.0003491569096274706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document falcon +0.00035277390475511834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document falcon +0.0003524972090026609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document falcon +0.0003504854249750236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document falcon +0.00034740238025423914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document falcon +0.00034968015462277606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document falcon +0.0003493798632762674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document falcon +0.0003488202537862122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document falcon +0.0003525461864643725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document falcon +0.00034903815232825664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document falcon +0.00035536982539258216 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document falcon +0.00034858083265155483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document falcon +0.0003505014973608067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document falcon +0.00035327984042622104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document falcon +0.0003503286677453136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document falcon +0.00035835274842442816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document falcon +0.00034970302660275595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document falcon +0.000357929573140149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document falcon +0.0003517238649788585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document falcon +0.00036097027318848475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document falcon +0.0003502734074110026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document falcon +0.00035801510806036273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document falcon +0.0003568006373479869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document falcon +0.00036128108717454636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document falcon +0.0003563436883111686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document falcon +0.00035559725321852463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document falcon +0.00035089656006854944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document falcon +0.000359453964362057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document falcon +0.00035629498059104033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document falcon +0.0003622207707090437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document falcon +0.0003540946784512821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document falcon +0.0003594750565232011 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document falcon +0.0003566007415086991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document falcon +0.0003562142599126134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document falcon +0.0003569948186744601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document falcon +0.00035166554847920186 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document falcon +0.00035047994419295137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document falcon +0.0003561578193739437 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document falcon +0.00035470866838811544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document falcon +0.00034216920464876335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document falcon +0.0003550021513075795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document falcon +0.0003488045105938729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document falcon +0.0003513340720840151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document falcon +0.0003448558566387584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document falcon +0.0003460966026953241 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document falcon +0.0003488157616036459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document falcon +0.0003446120387842362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document falcon +0.000351528602987427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document falcon +0.00035661118227454713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document falcon +0.0003551342699877457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document falcon +0.0003478953397924445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document falcon +0.00034625782458988215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document falcon +0.0003527515447405871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document falcon +0.00034823744889805696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document falcon +0.00034823314560254406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document falcon +0.00035162668292961944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document falcon +0.0003477307716074623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document falcon +0.0003446457989477787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document falcon +0.00034782916273767795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document falcon +0.0003517249130302248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document falcon +0.0003449873430908556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document falcon +0.00034841291749669877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document falcon +0.0003466028498941749 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document falcon +0.0003486436831199424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document falcon +0.0003478279234211838 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document falcon +0.0003495903653274374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document falcon +0.00034896893881218957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document falcon +0.000348941645312426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document falcon +0.0003474221308416894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document falcon +0.0003462621543839385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document falcon +0.0003669373860863891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document falcon +0.00034691156268163006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document falcon +0.0003527774103765281 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document falcon +0.00034684565672734663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document falcon +0.0003454250599604457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document falcon +0.0003541536557159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document falcon +0.000345735737037366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document falcon +0.0003524669816385214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document falcon +0.0003441817133096468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document falcon +0.0003519093265859089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document falcon +0.00035080085480352095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document falcon +0.00035285227929327434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document falcon +0.00034354836346901676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document falcon +0.00034789770937373467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document falcon +0.000343665920520102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document falcon +0.0003490884931060568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document falcon +0.00034380029463398654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document falcon +0.00034874768005099945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document falcon +0.0003457058510967673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document falcon +0.00034644265227023904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document falcon +0.00035008339858594957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document falcon +0.0003462377193296194 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document falcon +0.0003620491787114201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document falcon +0.000348717011044469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document falcon +0.00034370072363913706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document falcon +0.0003551981066775649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document falcon +0.0003500119496799342 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document falcon +0.0003485082952669081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document falcon +0.0003508155580978919 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document falcon +0.00035311375163251416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document falcon +0.00034945972003423253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document falcon +0.0003474220353789879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document falcon +0.0003536443686585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document falcon +0.0003560350489042953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document falcon +0.0003493655927914396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document falcon +0.0003528423977146383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document falcon +0.00035255554724471217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document falcon +0.0003479760010190111 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document falcon +0.00035458598862501956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document falcon +0.0003458990560538315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document falcon +0.00035157946422379875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document falcon +0.00034736860650169996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document falcon +0.0003529152313394119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document falcon +0.00034586294329524465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document falcon +0.00035707214923794877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document falcon +0.0003509580363496512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document falcon +0.00035244176725524474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document falcon +0.0003467539557999047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document falcon +0.00034919687962275546 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document falcon +0.00035094031731719953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document falcon +0.0003484309008351352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document falcon +0.0003485409424916253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document falcon +0.0003499590776117838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document falcon +0.0003492842758957848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document falcon +0.0003529712275178912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document falcon +0.0003566141287087449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document falcon +0.0003649496522047409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document falcon +0.0003563218912208234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document falcon +0.00035614782126966145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document falcon +0.0003531944298453266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document falcon +0.0003535950949566616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document falcon +0.0003544295554928795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document falcon +0.0003519908503740376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document falcon +0.00035752817626134463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document falcon +0.0003515322689589972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document falcon +0.0003486893890307115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document falcon +0.0003446520464889867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document falcon +0.0003509421562481707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document falcon +0.00035335015702909084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document falcon +0.0003490178167345008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document falcon +0.0003520497821155174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document falcon +0.0003549762618908944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document falcon +0.00035072190850833103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document falcon +0.0003542458638526423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document falcon +0.000352419194572916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document falcon +0.0003545102564672614 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document falcon +0.0003495437992331806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document falcon +0.0003542843376993964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document falcon +0.000352827529313958 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document falcon +0.00035442506093223886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document falcon +0.0003496970719044257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document falcon +0.0003553096424442362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document falcon +0.00034986845565067564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document falcon +0.000352131055186658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document falcon +0.0003527021708198983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document falcon +0.00034905885414547214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document falcon +0.0003583433842468394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document falcon +0.00034409435202828383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document falcon +0.00034846410520871483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document falcon +0.0003554459991927314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document falcon +0.00035310507471843076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document falcon +0.000350028910786098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document falcon +0.00035049727458009896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document falcon +0.0003519047735925826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document falcon +0.0003513027429919726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document falcon +0.0003626947260354396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document falcon +0.0003500087324849783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document falcon +0.0003618315726725285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document falcon +0.0003535385113938023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document falcon +0.0003487064058517615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document falcon +0.0003618709124780938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document falcon +0.00035040070335625915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document falcon +0.0003506279032267829 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document falcon +0.0003498435310527524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document falcon +0.0003554634749821431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document falcon +0.00035091209738758963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document falcon +0.00035034103678978573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document falcon +0.00035398931854386146 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document falcon +0.00035495529304989485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document falcon +0.00036067883473356603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document falcon +6.322825248625475e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document megawika +2.4432314037946264e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document megawika +5.6313888721313454e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document megawika +2.4208171781595055e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document megawika +2.325811856369237e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document megawika +2.4010790356322705e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document megawika +5.36773610843632e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document megawika +1.360574433501002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document megawika +1.3076540344853244e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document megawika +1.3386534334886313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document megawika +1.2498103719605153e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document megawika +1.403763836949682e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document megawika +1.3636756723495417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document megawika +1.2242489446940814e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document megawika +1.2398255818973339e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document megawika +1.2972616994216281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document megawika +1.3947809855914134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document megawika +1.3144843787829514e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document megawika +1.1693809976572487e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document megawika +1.3677252682893802e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document megawika +1.3940876719849597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document megawika +1.4222245138730965e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document megawika +1.3201677767919704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document megawika +1.1421717796486169e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document megawika +1.2890514724498703e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document megawika +1.3649507648749037e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document megawika +1.2400732563490717e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document megawika +1.1557681453277616e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document megawika +1.2294483595964517e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document megawika +1.2137484472122283e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document megawika +1.3299663426456e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document megawika +1.2461984216479532e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document megawika +1.4666434217609636e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document megawika +1.1876997894686238e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document megawika +1.2939155338964078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document megawika +1.3859590039728515e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document megawika +1.317917848615668e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document megawika +1.1335281536110342e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document megawika +1.2889923952861426e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document megawika +1.3471671647053326e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document megawika +1.2221720014475102e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document megawika +1.2632647276287541e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document megawika +1.28276219004076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document megawika +1.36213704321643e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document megawika +1.2414858625261553e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document megawika +1.3173700421883744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document megawika +1.295597796725686e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document megawika +1.242783936442904e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document megawika +1.2417374088427464e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document megawika +1.2134479405400744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document megawika +1.3090040663304255e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document megawika +1.2713470581614905e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document megawika +5.5750231378906594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document megawika +5.777597358425469e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document megawika +5.349786767471258e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document megawika +5.675165050453583e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document megawika +5.482611216158831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document megawika +5.065421899890121e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document megawika +5.384718357480146e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document megawika +4.872037363236061e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document megawika +4.532709250783155e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document megawika +5.7257963030489613e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document megawika +4.9014365579652036e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document megawika +5.722863552770969e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document megawika +6.149911636146833e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document megawika +5.2178057608273506e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document megawika +4.990228161160431e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document megawika +5.866186875255134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document megawika +5.004185734360719e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document megawika +4.79401853705107e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document megawika +5.435219965052376e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document megawika +5.035997225792266e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document megawika +5.622401774211625e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document megawika +5.028826157387559e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document megawika +5.596379470128795e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document megawika +6.027824493191489e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document megawika +5.5358270009931474e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document megawika +5.9839051807685496e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document megawika +5.1221077499249595e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document megawika +5.517228560620279e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document megawika +5.1687858285052305e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document megawika +5.684188244145645e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document megawika +5.212693275535878e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document megawika +4.8551007022784084e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document megawika +5.4888506639203145e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document megawika +5.345098688527242e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document megawika +4.8506420625516594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document megawika +5.132168603397676e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document megawika +5.719476795114223e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document megawika +5.7448621149792696e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document megawika +4.9068410568059265e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document megawika +5.382937299647678e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document megawika +4.8288432136304634e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document megawika +5.841703200305416e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document megawika +5.1589611587885584e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document megawika +6.031113829732574e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document megawika +5.4558202844532094e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document megawika +5.341852317196142e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document megawika +5.1402942738369954e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document megawika +5.735421384377395e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document megawika +5.473629863586958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document megawika +5.4708993245733936e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document megawika +4.931161863634078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document megawika +5.104173022127248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document megawika +5.510157161510824e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document megawika +5.652501401782597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document megawika +5.7273656573031666e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document megawika +5.638363224821738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document megawika +5.6128115396668704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document megawika +5.00304877998141e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document megawika +5.596120554779096e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document megawika +5.5280923889040006e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document megawika +5.223477917938408e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document megawika +5.29472809986569e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document megawika +2.205682378243213e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document megawika +1.4367563720603185e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document megawika +3.5506193487931076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document megawika +3.0442910855821778e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document megawika +2.2540042508019627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document megawika +2.6880163202623216e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document megawika +2.534473148048727e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document megawika +2.6560945431318916e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document megawika +2.547470248967691e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document megawika +2.5248825388073738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document megawika +2.5828729575000054e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document megawika +2.4026583817957736e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document megawika +2.3930425429834413e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document megawika +2.5037365362599724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document megawika +2.6696745470595603e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document megawika +2.140323051341762e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document megawika +2.617354786691592e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document megawika +1.538359101762691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document megawika +1.2871029252377856e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document megawika +2.255195411289217e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document megawika +2.4832313897952067e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document megawika +9.303873918189968e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document megawika +2.179532302620228e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document megawika +1.9750517506901206e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document megawika +2.7740420380648435e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document megawika +2.7813714782319335e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document megawika +4.1595357937609806e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document megawika +2.741365122389175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document megawika +2.117451071361901e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document megawika +1.7132649760565998e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document megawika +1.7492547092602047e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document megawika +1.7499951097392276e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document megawika +1.6632444789170958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document megawika +1.6678802252361607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document megawika +1.5519208704558896e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document megawika +1.652420992967167e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document megawika +1.6119931034508755e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document megawika +1.6638882076736552e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document megawika +1.7198076782652946e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document megawika +1.572927860565175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document megawika +1.5194822618169918e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document megawika +1.6677776832669846e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document megawika +1.595612492245688e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document megawika +1.682350633181197e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document megawika +1.663983380609724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document megawika +1.710187842689243e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document megawika +1.5733697527539038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document megawika +1.6972104757911438e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document megawika +1.6610142847616577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document megawika +1.61094882403031e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document megawika +1.4789207305138325e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document megawika +1.639299617676302e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document megawika +1.3241204512116132e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document megawika +8.582260726625535e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document megawika +8.213000975576739e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document megawika +9.549247732811947e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document megawika +9.17242785339013e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document megawika +7.632868223725218e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document megawika +8.674401118222175e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document megawika +9.124384255505347e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document megawika +8.344222222417358e-06 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document megawika +8.992299957499065e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document megawika +8.76689497361025e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document megawika +7.973396239586015e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document megawika +9.006935606644125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document megawika +8.725545954955498e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document megawika +1.215449694669174e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document megawika +3.3041720284158646e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document megawika +2.0593512412624502e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document megawika +1.893608946986248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document megawika +1.737111666788535e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document megawika +1.4915923449873955e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document megawika +2.289370239067605e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document megawika +2.8615335689614638e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document megawika +8.847283630883125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document megawika +1.8175470362373804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document megawika +1.8152226683368038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document megawika +1.789149655314284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document megawika +1.7690523036477663e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document megawika +1.8333732213753644e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document megawika +1.8794105687718654e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document megawika +1.721841156706417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document megawika +2.0612008685724796e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document megawika +1.9297370681336376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document megawika +2.0188440409661018e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document megawika +5.1741216329695265e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document megawika +1.3417913926038429e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document megawika +1.1010813016469651e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document megawika +1.1252416134320087e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document megawika +1.2801744104313002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document megawika +1.3041514955795817e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document megawika +1.3428837580879075e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document megawika +1.320809382267804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document megawika +1.3451566676555968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document megawika +1.228284926657501e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document megawika +1.2410599573923043e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document megawika +1.3815343367377182e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document megawika +1.3895126265148832e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document megawika +1.2306773644401741e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document megawika +1.32981021906281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document megawika +1.101337469221607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document megawika +1.513094184404692e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document megawika +1.1073759547073234e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document megawika +1.2879348765857567e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document megawika +9.619595770228435e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document megawika +1.2384340836286436e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document megawika +1.1766667232211577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document megawika +1.2871049236196452e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document megawika +1.2010645926497744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document megawika +1.3971428231518597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document megawika +1.2283733550547932e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document megawika +1.2659530508255308e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document megawika +1.551775613074462e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document megawika +1.1169413343776979e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document megawika +1.1433700593712463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document megawika +4.964773647323492e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document megawika +1.0995586595687313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document megawika +1.2957393071411267e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document megawika +2.75899247407709e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document megawika +2.8269344597344854e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document megawika +2.329108187246831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document megawika +2.4231761430460284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document megawika +1.2434140512230442e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document megawika +1.638718338352859e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document megawika +3.272953556801187e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document megawika +6.061314500486327e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document megawika +1.2465979731210292e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document megawika +1.2737557327967737e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document megawika +1.038428658075627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document megawika +2.61666472045566e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document megawika +3.6506873212272224e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document megawika +1.5066359138295701e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document megawika +1.1166290872121178e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document megawika +1.5546966228590285e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document megawika +1.2583434625014828e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document megawika +1.3398826881300862e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document megawika +1.2944933160515968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document megawika +1.0971437399901365e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document megawika +1.2787922795775774e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document megawika +1.404979227816985e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document megawika +1.3344734431324463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document megawika +4.886031157107555e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document megawika +3.277261443596394e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document megawika +3.5057957685786495e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document megawika +3.287625301718589e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document megawika +3.1370056372668855e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document megawika +3.186092015785841e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document megawika +7.271819324142512e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document megawika +0.001451215788905126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document open-web-math-train +0.0014486847196258788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document open-web-math-train +0.0008861032722895899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document open-web-math-train +0.0018119590809459816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document open-web-math-train +0.0008916937917547129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document open-web-math-train +6.960128832809415e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document open-web-math-train +0.002008403651063623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document open-web-math-train +0.0014374900742131454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document open-web-math-train +0.00180213596996716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document open-web-math-train +0.001956178877532413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document open-web-math-train +0.0008829547017667033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document open-web-math-train +0.0008910853619157279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document open-web-math-train +0.0018260998845299973 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document open-web-math-train +0.0012499632072059553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document pes2o +0.00125398260359913 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document pes2o +0.0012541704774729071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document pes2o +0.0012527268234360602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document pes2o +0.0012532925243737164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document pes2o +0.0012456396241204315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document pes2o +0.0012589894424352072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document pes2o +0.001508020123999618 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document pes2o +0.00333096950781965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document pes2o +0.0033233414614415547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document pes2o +0.003512387990689828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document pes2o +0.0035091382940513126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document pes2o +0.003514155927147005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document pes2o +0.003327108000579638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document pes2o +0.003329106196589836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document pes2o +0.003505604148738077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document pes2o +0.003324825759567855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document pes2o +0.0033248240149804913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document pes2o +0.0033385962112851358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document pes2o +0.0035043186296553615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document pes2o +0.003340469505431529 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document pes2o +0.0035106889084796276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document pes2o +0.0033309469281030167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document pes2o +0.003340337858029757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document pes2o +0.003505919861097801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document pes2o +0.0003882924098240512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document pes2o +0.0005759963691850877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document reddit +0.0005959971675332674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document reddit +0.0006026179290353799 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document reddit +0.0005824184320784846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document reddit +0.0005854598548616037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document reddit +0.0005903767055633473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document reddit +0.0005930306490982049 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document reddit +0.000569425602700746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document reddit +0.0005675060415179408 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document reddit +0.0005772431621253389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document reddit +0.0005678026053826858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document reddit +0.0005700398263483378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document reddit +0.0005669467963528824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document reddit +0.0005701015953324305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document reddit +0.0005795907287413296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document reddit +0.0005735602737531164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document reddit +0.0005749862745842101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document reddit +0.0005693257015931971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document reddit +0.0005716568794795563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document reddit +0.0005761083919774021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document reddit +0.0005688343169797355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document reddit +0.0005807913190929842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document reddit +0.0005710229258078636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document reddit +0.0005704083039826862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document reddit +0.0005862132348308056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document reddit +0.0005717662049559556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document reddit +0.0005858155213694451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document reddit +0.0005812012281792392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document reddit +0.0005803981414588498 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document reddit +0.0005700102108287723 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document reddit +0.0005719243459052329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document reddit +0.0005867253401661752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document reddit +0.0005731087218860733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document reddit +0.0005712197789109317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document reddit +0.0005702376926310089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document reddit +0.0005700411527742972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document reddit +0.0005828090098178196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document reddit +0.0005770140826168056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document reddit +0.0005723509664597896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document reddit +0.0005755499231836962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document reddit +0.0005636407438471367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document reddit +0.0005640281556500104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document reddit +0.0005633159058766496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document reddit +0.0005638034311151449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document reddit +0.0005630066273073224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document reddit +0.0005631803831128559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document reddit +0.0005631228881679657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document reddit +0.0005628178701487633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document reddit +0.0005624448092256196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document reddit +0.0005620957024062329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document reddit +0.0005614201504177484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document reddit +0.0005616890951464056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document reddit +0.0005611348559279058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document reddit +0.0005604238061828518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document reddit +0.0005603301490194237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document reddit +0.0005607291294548833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document reddit +0.0005605234569930727 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document reddit +0.0005613778566640694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document reddit +0.0005610248539992471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document reddit +0.0005599977416780475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document reddit +0.0005603632562116935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document reddit +0.0005599177479509897 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document reddit +0.0005595202318298379 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document reddit +0.0005600975633499175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document reddit +0.0005614075491213365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document reddit +0.000612563885043477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document reddit +0.0005515469909644413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document reddit +0.0005526782014946906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document reddit +0.0005472463408095445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document reddit +0.0005502284746004587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document reddit +0.0005414514790555363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document reddit +0.0005513499500134784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document reddit +0.0005391391454105187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document reddit +0.0005415836910001838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document reddit +0.0005208132468536551 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document reddit +0.0005889827143132871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document reddit +0.0005822520817765276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document reddit +0.0004173155230758696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document reddit +0.0009994361338078242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document stackexchange +0.001087156194657966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document stackexchange +0.0010667737163656816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document stackexchange +0.0009602877882124873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document stackexchange +0.0008968956271971105 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document stackexchange +0.0009198034843762967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document stackexchange +0.0009423901016715341 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document stackexchange +0.0009674094553686345 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document stackexchange +0.0009858331322519164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document stackexchange +0.0009970593645879198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document stackexchange +0.0010027035193731686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document stackexchange +0.0010128291154221853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document stackexchange +0.0010215631382631918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document stackexchange +0.0010288663771461238 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document stackexchange +0.0010346219929285867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document stackexchange +0.00104544019940344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document stackexchange +0.0010525172676724333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document stackexchange +0.0010609529620775127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document stackexchange +0.0010725892748610153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document stackexchange +0.0010818563598181568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document stackexchange +0.0010992760196793917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document stackexchange +0.0011178992762079917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document stackexchange +0.001124687532085676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document stackexchange +0.001118303661267191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document stackexchange +0.0010206825575416534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document stackexchange +0.0005512280117499715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document stackexchange +0.004474659408857016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document starcoder +0.00409944473890653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document starcoder +0.005137179939941845 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document starcoder +0.005143172251066109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document starcoder +0.005206134363352808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document starcoder +0.004892747858974329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document starcoder +0.004844731352552902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document starcoder +0.005308320169123755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document starcoder +0.005124709815666577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document starcoder +0.005424710744483826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document starcoder +0.00538244648861977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document starcoder +0.0029107284679086853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document starcoder +0.0026825258998444705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document starcoder +0.0026904503191419243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document starcoder +0.002687906577174073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document starcoder +0.002850165346048818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document starcoder +0.005322698571717847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document starcoder +0.004450334290869719 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document starcoder +0.004700990083440683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document starcoder +0.003903568556500995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document starcoder +0.00390561515396931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document starcoder +0.0039046402900912262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document starcoder +0.003907454839379547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document starcoder +0.0038583224578603824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document starcoder +0.0037914116657695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document starcoder +0.003786665266798682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document starcoder +0.003792000802430658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document starcoder +0.00319266847466091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document starcoder +0.0032658716699838944 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document starcoder +0.0034801959532460023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document starcoder +0.0028307012092022594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document starcoder +0.0028420360878146276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document starcoder +0.0028410455248484914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document starcoder +0.00283497183526842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document starcoder +0.002840187195459487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document starcoder +0.0028398709431369834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document starcoder +0.004364722843422023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document starcoder +0.004093255713117101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document starcoder +0.004092331079566252 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document starcoder +0.004005326985579649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document starcoder +0.0036205502856964207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document starcoder +0.003625316793034984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document starcoder +0.003604743435602363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document starcoder +0.0035405823343673125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document starcoder +0.0041601413517253945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document starcoder +0.005886303658937057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document starcoder +0.003600909532810332 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document starcoder +0.0034941365817168658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document starcoder +0.0004992164842980224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document starcoder +0.00032927705604725614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document tulu +0.0002860154190878753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document tulu +0.0002845217585425619 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document tulu +0.0002743528685497456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document tulu +0.00026025323737738766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document tulu +0.00023493876414603155 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document tulu +0.00029665994994226705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document tulu +0.00031808102075993956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document tulu +0.00031813573046011285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document tulu +0.0002711905171855542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document tulu +0.00028892513401817095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document tulu +0.00030003908676979083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document tulu +0.00026839878771944684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document tulu +0.00029155935002690497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document tulu +0.0002998624927624209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document tulu +0.0003091705447974841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document tulu +0.00026873195794309786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document tulu +0.00027721873498527547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document tulu +0.0002841662554024377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document tulu +0.0002839461156551537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document tulu +0.0002861705604659811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document tulu +0.0002460995649635886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document tulu +0.00019420142619795496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document tulu +0.00021967677816173628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document tulu +0.0002620283200480949 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document tulu +0.0002433390542188936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document tulu +0.00021254976608350767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document tulu +0.00022094815569522115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document tulu +0.000342862378668244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document tulu +0.00033784225259118157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document tulu +0.0003367278459543952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document tulu +0.00029843279042852765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document tulu 
+0.0002926583661257988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document tulu +0.00029320337282010673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document tulu +0.00029281450669483455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document tulu +0.0002915338187002653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document tulu +0.0002864226923084572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document tulu +0.00028643439083586396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document tulu +0.00028253710956299054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document tulu +0.0002810856078805806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document tulu +0.00031474941344656715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document tulu +0.0002139130222205655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document tulu +0.0003084648871862831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document tulu +0.0003309477872140129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document tulu +0.0003360096824695161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document tulu +0.0003355452655196557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document tulu +0.00038119390366386037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document tulu +0.00038078927630086064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document tulu +0.0003386200917551554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document tulu +0.0002158905159938882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document tulu +0.00021621682877018768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document tulu +0.00021553306942740535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document tulu +0.00021581563462722296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document tulu +0.0002157694110556169 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document tulu +0.000215643699847159 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document tulu +0.00021532716715168094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document tulu +0.00021531221326022472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document tulu +0.0002831801179028896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document tulu +0.0002514844936507595 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document tulu
+0.00031638782778107964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document tulu
+0.0002749197545278445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document tulu
+0.00026159721512464495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document tulu
+0.0002630052420096968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document tulu
+0.00031106811228913666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document tulu
+0.0002852973415334161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document tulu
+3.7555372465932136e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document tulu
+0.003548077173506675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document wiki
+0.0018372203137874265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document wiki
diff --git a/ALCF/data-lists/sunspot/dolma_v1_7_file_list.txt b/ALCF/data-lists/sunspot/dolma_v1_7_file_list.txt
new file mode 100644
index 0000000000..5d142522a7
--- /dev/null
+++ b/ALCF/data-lists/sunspot/dolma_v1_7_file_list.txt
@@ -0,0 +1,2419 @@
+0.0018520780893211373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document
+0.0017591050606817512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document
+0.001459052794333798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document
+0.0007405667281569194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document
+0.00019420030110896795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document
+0.0009008668715801845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document
+0.00015115827957143057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document
+0.0014552844319220648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document
+0.0012469861325685161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document
+0.00136412011372413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document
+0.0007064279699221103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document
+0.0008472240000687427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document
+0.0001984375713341955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document
+0.0005472773881697123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document
+0.001815779629850992
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document +0.0018313600689757324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document +0.0002583902668716813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document +0.0002646575141232155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document +0.0003165521247456758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document +0.0002920706460176214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document +0.00028396813182810215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document +0.00030445161883108107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document +0.00031628781276576474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document +0.0003083776568189157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document +0.0003176359471472902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document +0.0002536009369131698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document +0.0003067491424681363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document +0.0002597217257557784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document +0.0003788556450109768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document +0.0002796563272052598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document +0.00033573826524290287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document +0.00030523658022800287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document +0.00032211552192240096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document +0.0003329295675164247 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document +0.0003101982186639862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document +0.00032361798234223355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document +0.0003495541581652915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document +0.0002821637448858042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document +0.00030399523537629673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document +0.0002955658968247219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document +0.00028942158502924254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document +0.00028769546171490733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document +0.0002938111057234182 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document +0.0002711150403010948 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document +0.00031130095874747565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document +0.0003002996118160777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document +0.0003732757901604459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document +0.00026784205751795894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document +0.0002799626521661984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document +0.00034334276069078164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document +0.0003582469803674965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document +0.00031094844818418623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document +0.0002766228384977191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document +0.00030297116159471485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document +0.00027033888377464685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document +0.00030090862368377933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document +0.00028543875802490955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document +0.00027559768459074204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document +0.0003182185533962886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document +0.0003311392971435837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document +0.00028751652060804325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document +0.000303466863212589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document +0.00033400462801277524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document +0.0002589234031777426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document +0.0002913508598466723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document +0.0002670572450004856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document +0.00032027399105647656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document +0.00032188376258379377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document +0.0003161585784100882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document +0.0003184249182974135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document +0.00030381336664000807 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document +0.0003190437442184283 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document +0.0002537961798200545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document +0.0003017817117223326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document +0.00028685268513240224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document +0.00031265179094451165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document +0.00034708319096986816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document +0.00026650837943080664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document +0.00034588832248507335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document +0.0002416982248399037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document +0.0003089296918222243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document +0.00029137184185700827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document +0.00026464226846800774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document +0.00030545397919456627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document +0.0003206778460448875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document +0.00030968971641110967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document +0.00023325653928600864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document +0.00030526899198338555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document +0.00035376719076633584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document +0.000290224385981026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document +0.000294650083382008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document +0.00028768858128616436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document +0.00030856965235527843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document +0.00030579942447879054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document +0.0002863101084704357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document +0.0002870032092492213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document +0.000264182727569885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document +0.0002974012367036449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document +0.00032238412143059203 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document +0.00031683716893819036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document +0.00031157434937617524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document +0.0003411742735695989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document +0.00026778444816570715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document +0.0003037045797275201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document +0.00027746114370081314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document +0.00027148285946862043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document +0.00028042950114678207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document +0.0003235607816590721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document +0.0003086692227306295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document +0.00033990349455148105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document +0.00030945053208470265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document +0.00027309074552265303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document +0.00028737393506316194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document +0.0003098868328009879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document +0.0002614229162588409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document +0.0002884388407820923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document +0.0031025147279277244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document +0.003102019887362634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document +0.0009996745994661548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document +0.0002406272620255565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document +0.0002404825539493424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document +0.00024062296575435581 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document +0.00024069315766818953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document +0.00024055829162263452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document +0.00024062053397343032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document +0.0002410715545206964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document +0.00024024881846087368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document 
+0.0002407074700790688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document +0.00024072141428809043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document +0.00024027710230872736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document +0.0002409111299205489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document +0.00024081954058275009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document +0.00024086076794990912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document +0.00024098672620832446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document +0.00024068622303333862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document +0.00024140627024291824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document +0.0002414512033594384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document +0.00024028742594941463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document +0.00024018036089269645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document +0.0002398347365034979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document +0.00024006780153485276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document +0.00024015620270419213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document +0.0002408848259695227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document +0.0002408023185278831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document +0.00024021196580140326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document +0.00024077677271297493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document +0.00024087392454668027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document +0.0002408071293824126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document +0.00024042223828845715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document +0.0002411484752360495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document +0.00023605263746465907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document +0.00023471222158326908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document +0.00023432138580287644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document +0.00023407385623382327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document +0.00023487504174367091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document +0.0002341843704976313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document +0.00023421993170282486 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document +0.00023445057969132037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document +0.0002337681680073047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document +0.000234627964808109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document +0.0002338942211888584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document +0.00023403849286843386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document +0.00023405641310796305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document +0.00023349169562397965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document +0.00023381157386048856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document +0.00023388742993790587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document +0.00023363103829469813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document +0.00023421141834630477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document +0.00023420564352232565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document +0.00023367463699173143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document +0.00023344969163567033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document +0.00023372196941547188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document +0.00023399207645297834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document +0.00023357915605505856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document +0.00023337585642190864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document +0.00023385005470157914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document +0.00023301533534493465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document +0.00023377864302541782 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document +0.00023323745848621437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document +0.0002330594611151835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document +0.0002334149675026783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document +0.00023198945902291534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document +0.00023023784834634142 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document +0.00022985623060187217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document +0.0002292605284569516 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document +0.00022926593333048894 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document +0.00022922766406807777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document +0.00022898153911167426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document +0.0002292473111593315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document +0.000228804579400424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document +0.00022865485613513526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document +0.00022937426835887895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document +0.00022917388311587372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document +0.0002291660582019043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document +0.00022907895248360543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document +0.0002294617879920205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document +0.0002290452150516566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document +0.00022943405619715553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document +0.0002296271421006204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document +0.00022854791372910372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document +0.00022923123467686557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document +0.00022852404355738494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document +0.00022847798660086642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document +0.0002289604586810316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document +0.00022835479834950643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document +0.0002289149402884243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document +0.00022806655474763446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document +0.00022826296420992974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document +0.00022906829636213627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document +0.0002287628414466998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document +0.0002282673911253445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document +0.00022869309841939134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document +0.0002281540116815451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document +0.0002259755756162738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document +0.00022562331285233504 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document +0.0002259061146106053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document +0.00022567670836663787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document +0.00022573165387587061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document +0.00022508514961670572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document +0.00022564642513773356 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document +0.00022563088621998788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document +0.0002250438755373707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document +0.00022524465346241134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document +0.00022531737657666812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document +0.00022444687519363458 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document +0.00022460397498596298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document +0.00022454218976501763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document +0.00022447528843671366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document +0.00022501666332178926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document +0.00022453752304377972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document +0.00022484451871163002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document +0.00022465678847154914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document +0.00022453180917044732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document +0.0002247278486823009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document +0.00022465794828242097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document +0.00022431000701925386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document +0.00022476020248460963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document +0.00022467531771795015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document +0.0002236391309945234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document +0.00022458764920536007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document +0.00022430877426744415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document +0.0002247047786127192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document +0.0002245298090400035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document +0.0002245648831396188 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document +0.00022292894729820784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document +0.00022236668082957533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document +0.0002217622659895442 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document +0.00022252452726732609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document +0.00022135333211363678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document +0.0002214571757787971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document +0.0002217188139237798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document +0.00022144214894640303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document +0.00022100172806631854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document +0.00022156392409199052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document +0.00022134830143710272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document +0.00022158598922529453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document +0.00022142932483041377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document +0.00022120980907786554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document +0.00022117917738112441 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document +0.00022077089397851235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document +0.00022093265074996711 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document +0.00022091299741377004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document +0.0002205849150703338 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document +0.0002210648204787979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document +0.0002214235747364102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document +0.00022083907302221787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document +0.0002206334237915964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document +0.00022065193929912214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document +0.00022079775597767288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document +0.00022091492909963518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document +0.00022095009987097293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document +0.0002208150577180165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document +0.00022085759102772088 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document +0.00022073789170129016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document +0.00022049322781182384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document +0.00022083270617761285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document +0.00021982452827473632 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document +0.00021899870446514259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document +0.00021890358773356361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document +0.00021875556609042841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document +0.00021861195987201226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document +0.00021856782186167455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document +0.00021912837771543515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document +0.00021900213768517756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document +0.00021871675851390374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document +0.0002180537056545586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document +0.0002188196714327129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document +0.00021851362624523464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document +0.0002183236795498736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document +7.291153618675672e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document +0.0003742481815405742 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document +0.00038204855962733055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document +0.00038821818392663593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document +0.00038723332988783727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document +0.00038916141142149904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document +0.00038049542523949033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document +0.0003854755539534284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document +0.00024202756466512517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document +0.0003915405155008087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document +0.0003927382151931033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document +0.0003839151202260479 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document +0.00040006817468967907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document +0.00040318965964443476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document +0.0003831013019452741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document +0.00039166638383204036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document +0.00039962784023961004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document +0.00039536707853602614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document +0.0004204304698247758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document +0.00041538899178693555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document +0.00039186953333675306 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document +0.00038945837196504305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document +0.0003919951238929062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document +0.00044377065718528966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document +0.0004407759068603017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document +0.0002487811895843715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document +0.00039349432045556636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document +0.00041223198559462343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document +0.0004036573014830213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document +0.0003825982215521807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document +0.00040386867133151386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document +0.00024460575279105167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document +0.000269029789531335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document +0.0003573757493252864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document +0.0004600876681392076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document +0.0002605354166397086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document +0.0003882502452157999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document +0.0002466747612126512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document +0.0004024726105072402 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document +0.00040820631128483644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document +0.0002691094350403538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document +0.00026916830387277267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document +0.0004204663297880574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document +0.00042379698687085554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document +0.0004502169227311871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document +0.0002661708937015295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document +0.00031239486948031334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document +0.0003109054589936201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document +0.00045873053079760646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document +0.00022904931423244635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document +0.0003813462028433663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document +0.00039188129256500874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document +0.00045124222276983765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document +0.00048138658436853695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document +0.0003944178776279866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document +0.00039941569676754006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document +0.00037952761190240494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document +0.0003944870860881476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document +0.0003891842411856621 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document +0.000387688981934861 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document +0.00039197953876258005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document +0.00039007915280311206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document +0.0003995520363699188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document +0.00039230985654592406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document +0.0003929472067173851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document +0.0003924096172671473 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document +0.0003881636143629905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document +0.000389790617937084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document +0.00037351762309221023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document +0.0003630196170929407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document +0.00033532465765142113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document +0.0003076088685761823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document +0.00039463850897720803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document +0.0002843816115231449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document +0.0002909175709416474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document +0.00028867170997202486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document +0.0002838644617723659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document +0.00029027869525543416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document +0.0002821339567560056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document +0.0002922988877045601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document +0.0002866955958315786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document +0.0002865271754558126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document +0.0002861247475618473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document +0.0002826681072408606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document +0.0002849746458282827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document +0.0002816966633435316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document +0.00026255342235948463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document +0.0002552895098829678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document +0.00025990194083107813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document +0.0002524062657685835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document +0.0002538577379748611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document +0.0002561415177406761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document +0.00026206253059694905 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document +0.00026168095406910565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document +0.0002601305742008613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document +0.00025200823006814814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document +0.0003229951981263502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document +0.00037289448266476045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document +0.0003807825862179898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document +0.0003616333738191483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document +0.0003665117918907636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document +0.0003684186453633228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document +0.0003589330610806066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document +0.00036383861418030395 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document +0.000359841363355303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document +0.00036431044063050464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document +0.0003668574090358279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document +0.000362768263620199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document +0.0003501888032771077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document +0.000352401968221528 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document +0.0003541019701869794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document +0.0003628121865546891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document +0.0003752582953758773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document +0.00037902046230424966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document +0.0003777927146925147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document +0.0003760676130509053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document +0.00034046049078755405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document +0.0003338847563259091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document +0.00033294499102761794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document +0.0004912026198265864 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document +0.00032064363474664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document +0.00032154190389541214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document +0.00032309660151746207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document +0.00031181143365304544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document +0.00031046092294569104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document +0.00031150165249068046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document +0.0003041314265988224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document +0.0003024834909739394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document +0.0003019936835833604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document +0.000292329665283177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document +0.0002867061143144972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document +0.00028443615610701707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document +0.00028462291013755945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document +0.0002793538601205013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document +0.00027306573977044246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document +0.00027097155673336525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document +0.0002752934202112985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document +0.00043042012694697647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document +0.00047495648822986177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document +0.00047755032493473855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document +0.0004706974343933747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document +0.00046682163297771817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document +0.0004616765425874178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document +0.00030644496751628097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document +0.0002909492555358308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document +0.00027272036068261724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document +0.0004101070217315588 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document +0.0003728914338834357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document +0.00036546911442305647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document +0.0003669945482407483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document +0.0003715902407424017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document +0.00035837486406683366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document +0.0003573318538685469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document +0.0003553784893071916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document +0.0004920659809912352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document +0.0004533619411303183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document +0.00045067066057818706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document +0.00044396985139270645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document +0.00043198288204468477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document +0.00043005174223738454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document +0.00041847118430776784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document +0.00042952036375796664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document +0.00043420594647324267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document +0.0003461123241053012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document +0.0003408581597849182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document +0.00033172705422182547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document +0.0003392566490686136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document +0.00033578341518385483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document +0.0003439196710518844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document +0.00034559163447085543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document +0.00033762478642902825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document +0.00033215210055107224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document +0.00033423579608014966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document +0.0004963355016025102 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document +0.0004996862761456923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document +0.0005000551829325451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document +0.0005004212610098755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document +0.00027768695585500585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document +0.00028395983854338433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document +0.00027835826303062254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document +0.0002740073176010804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document +0.0002791830529274016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document +0.0002796863816194411 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document +0.00026697453022672804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document +0.0002594197440280141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document +0.0003779565697649222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document +0.00041835823476586606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document +0.00043788493575265915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document +0.0002731731970096006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document +0.000276305847423402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document +0.0002704955773958623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document +0.0002629635944827518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document +0.000260070956974436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document +0.00025661553791456334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document +0.00025794727207576157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document +0.00025295733980001527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document +0.0003788106407021029 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document +0.0004882344027669431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document +0.0003275324309642705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document +0.0004803401856640094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document +0.00046720138323433943 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document +0.00043527810307095335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document +0.00043905395741627827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document +0.00048774175867331425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document +0.00048380704121346737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document +0.0004779011848346118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document +0.00046255587581908036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document +0.00045127922880511576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document +0.0004503891485256095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document +0.0004450142332303422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document +0.00044630282482516654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document +0.00044325014465743616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document +0.0004263874842796447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document +0.0004217530913646938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document +0.000415120314341852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document +0.00040987168279144537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document +0.00033468337266607834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document +0.0003353094464683005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document +0.0004833936821707294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document +0.00047194878988920935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document +0.0004648324126996427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document +0.0004562345003964941 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document +0.0004933203505465098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document +0.0003530166075325466 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document +0.00035368548192804685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document +0.0004872620828289663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document +0.00048293889392426456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document +0.00047936768462267655 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document +0.00047821013991587545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document +0.0004660610308564753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document +0.000394683430103437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document +0.00039165053441571324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document +0.0003906936040164381 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document +0.00038074803919159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document +0.0003686529291578143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document +0.00035832920428870976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document +0.00035929024535947033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document +0.0003538226556050544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document +0.0003584167868708799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document +0.0003480507542594234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document +0.0003413709023543034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document +0.00034001304759361455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document +0.00033430532902756514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document +0.00046519252660631277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document +0.0002938876402514769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document +0.00028676090994509047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document +0.00027296150117506716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document +0.00026513502621960483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document +0.0002680081327926125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document +0.00025831225828720344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document +0.00026647037295561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document +0.0002525733734572654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document +0.00025831708887575375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document +0.00042487627444443476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document +0.0004951213245023891 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document +0.0004804051413177752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document +0.0004662397611340532 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document +0.0004550138655253933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document +0.00044494909122746795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document +0.0002899112253051385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document +0.0004372879736279761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document +0.0004529568099252922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document +0.00045127826158829573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document +0.0004436558176737439 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document +0.0004419233237678378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document +0.000434589215880319 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document +0.00029153613207706566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document +0.0004312458058738854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document +0.00028741854968757313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document +0.00046853200754421234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document +0.0004949145252030074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document +0.00044459683920483167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document +0.0003836095306696336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document +0.0003789760237872398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document +0.0003749227438304427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document +0.0003628558277173369 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document +0.00039468301394041474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document +0.00038874701821614864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document +0.0004158492456077867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document +0.00042360504554060077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document +0.00040386729844317623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document +0.00027595096702902474 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document +0.00043638766787829135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document +0.0002218691596850179 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document +0.0004437566108089954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document +0.0003889996411609667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document +0.00043454421906537704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document +0.0004522564392830988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document +0.00041517835659357416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document +0.0002614360863446896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document +0.00037543522111463596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document +0.0004386190133514781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document +0.00046358333286115075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document +0.00043186261317942404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document +0.0002377581602097957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document +0.00025973334085074254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document +0.00040139099332000796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document +0.00043674860686687174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document +0.00040853289309329373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document +0.000242910191729688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document +0.0004431071731750582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document +0.0004388092670482523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document +0.000381418866255965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document +0.0004100117296419717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document +0.00042469230366022745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document +0.00041744151905374254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document +0.00022835699906752945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document +0.0004380161085387397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document +0.00044803212381807456 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document +0.00040554932796137236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document +0.0004234508646347761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document +0.00043341209652360653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document +0.00023966604734537185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document +0.000259165907316014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document +0.0004270653021833602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document +0.0004341547032162028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document +0.0004111478117275994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document +0.0004299383567984396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document +0.0004241899124590779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document +0.0004502719349364145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document +0.00038994621469645615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document +0.0003859912398894952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document +0.0004247535950310557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document +0.000386982084327716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document +0.0004196451040053251 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document +0.0004096278509782259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document +0.0004373334932695721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document +0.0004180889975240641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document +0.00042079636929672745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document +0.00038063574611812913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document +0.0003817505891515542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document +0.0004420096268860222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document +0.00039182670726410623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document +0.0003635667850372299 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document +0.00041564996472055667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document +0.000400529358757286 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document +0.0003939113874958451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document +0.00039066622068940996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document +0.0004290098538807143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document +0.0004240739958197099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document +0.00040775392659215333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document +0.0004091634200396925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document +0.00042299190476617914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document +0.0003701492680344151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document +0.0003807353844384635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document +0.00038813507771983156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document +0.00040072346558408346 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document +0.0003603595180423597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document +0.00038799421353112465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document +0.00037575235582264926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document +0.0004239190342959713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document +0.0004606044799136546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document +0.00045107950652529253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document +0.0004391947201871058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document +0.0004457516661123035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document +0.0004301297170991686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document +0.00044661704164586694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document +0.0004438849846114837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document +0.0004444205734316823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document +0.0004190924165303394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document +0.00043942581131677875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document +0.00021568459798090663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document +0.0003814929225407199 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document +0.0003217453179359235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document +0.00031719591470267974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document +0.00032434115726922137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document +0.0004079911120371051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document +0.000329492766381148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document +0.0003845916162001633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document +0.0003835208964390098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document +0.00037847334157173194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document +0.00038296039903791865 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document +0.00037896336828472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document +0.00037620974396391355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document +0.00037420590727111843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document +0.000340490625886403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document +0.0003078314411035827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document +0.00034153990750656097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document +0.0003308858103982067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document +0.0003452640607156025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document +0.00033095276418403455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document +0.0003116308995860414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document +0.00032446713226408477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document +0.0003015816821912984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document +0.00031612418775706894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document +0.0003278516344971041 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document +0.00033079446736097217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document +0.00032278977146550837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document +0.00032065272988207914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document +0.0003936696452406576 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document +0.0003450109536627789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document +0.0003339787189919641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document +0.0003284303856176974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document +0.00033652677276843477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document +0.0003257822443845694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document +0.0003293985569149334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document +0.0003310360260148262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document +0.0003233770986418526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document +0.0003172280092149422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document +0.0003160674744292835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document +0.00030931090289598506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document +0.0003093173886443107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document +0.00033167847081104083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document +0.00031131501311729723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document +0.00031046608876279845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document +0.00030569235942207244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document +0.00030777943671285197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document +0.00029303314290956683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document +0.0003045824546400205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document +0.00030360880677729793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document +0.00031646239964835433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document +0.0003129122300603785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document +0.00031060464956661433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document +0.000311819032500067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document +0.0002977872483902282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document +0.0003009448600922438 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document +0.00028610292098537774 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document +0.0002988326876216654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document +0.00028550828372819075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document +0.0002830381750875739 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document +0.0002848495855927156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document +0.0002856443760308144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document +0.00027442895344188584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document +0.0002681160554049462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document +0.0003421482544126989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document +0.0004005872948449718 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document +0.0003930123959320308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document +0.0003867271832275778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document +0.000380805140455254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document +0.0003814769861947819 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document +0.00038025170883282324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document +0.0003738026647867475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document +0.00018960856915036276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document +0.0003697177501953134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document +0.00036674194328136693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document +0.00036447406838697555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document +0.00036686410861101255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document +0.00035915267825103423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document +0.0003624758404026675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document +0.0002822812140180794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document +0.00030620512946920813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document +0.000294249776520589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document +0.00030238536967523434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document +0.00029509593361580754 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document +0.0002906912701830899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document +0.0002921944165474959 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document +0.00028358919691127954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document +0.0002813182772323272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document +0.00027442640800299205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document +0.0002747820342933984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document +0.0002747584403979717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document +0.00027499129634862444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document +0.0002712050404257197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document +0.0002616256943143254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document +0.00026769938929002815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document +0.00038396081322727017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document +0.0003863140490027991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document +0.00037702277513203237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document +0.0003633274156107032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document +0.0003587473889240435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document +0.0003507672084278415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document +0.00033776425499780385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document +0.0003377914127574796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document +0.00032948015659161326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document +0.00033245638541392985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document +0.00031080707640648695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document +0.0002976903331149755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document +0.0002965121463725523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document +0.0002933849695266647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document +0.0002837035078508233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document +0.00028684569079589323 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document +0.0003145192320802359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document +0.0003566937253273515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document +0.0003470199109592918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document +0.0003060245312041868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document +0.0002650817213818789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document +0.0002643604938780134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document +0.000299350876031416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document +0.0003178540797697938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document +0.000271850367887767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document +0.00031349896596549 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document +0.00031749734412765755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document +0.0003791137842391209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document +0.0003742334169957992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document +0.0003705639757351107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document +0.0003126986769797042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document +0.00031038132814561196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document +0.00036464437173804883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document +0.0003569480488951322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document +0.0003541239221619106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document +0.00035315297411308053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document +0.0003572451925404141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document +0.0003514986129411253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document +0.0003521798298425866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document +0.00034553677439244716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document +0.000349004719809412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document +0.0003468247484872769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document +0.0003465822608356558 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document +0.00035410983132162007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document +0.0003487908354969444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document +0.0003479024763238147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document +0.000341412530646823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document +0.00034451316273667034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document +0.0002618849993484869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document +0.00026788679978901144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document +0.00027450670773227214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document +0.0002661273129899329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document +0.00026836569676402957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document +0.00026155876975483236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document +0.0002609276830117151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document +0.0002644161630512771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document +0.00036789208972872557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document +0.00037829849439990513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document +0.0003788894943523098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document +0.0003617207777959397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document +0.0002541334487248998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document +0.0002707945538071073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document +0.00027046282716455214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document +0.0002652443167243215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document +0.0002685859923850986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document +0.00025734961751176414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document +0.000259041720872915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document +0.00025340107274823446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document +0.00025757135121837893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document +0.00025617700500574084 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document +0.0002566931670562857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document +0.0002543871190716101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document +0.00024997565589481713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document +0.0002954079779456287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document +0.00034890741135252835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document +0.0003473298137731525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document +0.0003296959618486435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document +0.0003304520061604598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document +0.00032377956175729824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document +0.00031700696295168713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document +0.0003060382346081943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document +0.0003012003005056863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document +0.0002981074073993884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document +0.0002922128825950705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document +0.000348901087722931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document +0.0003408286289467841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document +0.0003410649680770183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document +0.0003358524215576502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document +0.0003343661874989231 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document +0.00032810573699389156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document +0.00032261449539097497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document +0.0003162694866049203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document +0.0003158381156468853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document +0.000317376061083603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document +0.0003125788639953052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document +0.0003010105041885602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document +0.0003065865059090678 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document +0.0003084275726508053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document +0.00030966560718296085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document +0.0002957728057853081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document +0.00029904164542325336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document +0.0002955358888729187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document +0.00028692976446931544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document +0.0002923476214935797 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document +0.0002893691697212419 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document +0.0002855895211981585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document +0.00027968347097626246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document +0.0002810783462604979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document +0.00027794080455729715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document +0.00034784376461416953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document +0.0003488347959010943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document +0.00034790583710250724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document +0.000345913166618151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document +0.00033801936268066675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document +0.0003290591130212315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document +0.00034051399521366823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document +0.00032470943131841784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document +0.00031679540050914276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document +0.00031814596342422325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document +0.0003156466289485036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document +0.00029985010879003633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document +0.0002905176377776361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document +0.0004206836775460856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document +0.00020660449162246918 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document +0.0003461727254468087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document +0.00020592870907067763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document +0.00034173505299233005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document +0.0004052437256652738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document +0.0004080650901351697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document +0.00039778184149144276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document +0.00039046311464950275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document +0.00039043444911071384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document +0.000388575704932843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document +0.00019737533145666597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document +0.00037610755595812403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document +0.00037315400127598317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document +0.00037415028580922163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document +0.00036694041707212337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document +0.00018947219857306515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document +0.00037046050826533545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document +0.0003587440768559087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document +0.00034623936498708903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document +0.0003502289592617922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document +0.00034692398063649823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document +0.000339340809421849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document +0.0003360510394816983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document +0.0003354673850814145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document +0.00032937682875877047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document +0.00032844505049317715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document +0.00028287199339908627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document +0.0002795217197003578 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document +0.00028048955601883463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document +0.0002769326396439027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document +0.0002727090021299243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document +0.0002726577841024554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document +0.00026663619593455374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document +0.00026068042672138127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document +0.0002637704114326801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document +0.0002593043567100412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document +0.0002599897110113453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document +0.0002435078682758859 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document +0.0002450530071379054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document +0.00024233331983743606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document +0.0002934750947999535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document +0.00033241226364044474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document +0.00032938406090272075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document +0.00032778705403953246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document +0.00032184551480398754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document +0.00031874002264945737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document +0.0003165319685666433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document +0.00031307071173376295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document +0.00031119524184911957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document +0.0003102253344576429 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document +0.0003088976240383192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document +0.0002951410823077708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document +0.00029772657676757413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document +0.0003056048989909935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document +0.00031991305381648026 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document +0.00030890256978362426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document +0.0003109382904091933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document +0.00031035798529690644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document +0.00030741666395911753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document +0.0002989918594861846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document +0.00029569635443989434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document +0.0002973992445667285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document +0.000293397351001072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document +0.00028737817438047954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document +0.00028252738144009747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document +0.0002805511898623541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document +0.0003718020784620472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document +0.0003499713845765235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document +0.00034283547445326676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document +0.00031464759888838765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document +0.00033188946446414833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document +0.000326084432195463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document +0.0003764568303917893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document +0.0003604955598858414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document +0.0003655654554133222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document +0.00035762304033750504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document +0.00038478883950347103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document +0.00027735714341247454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document +0.00028139534607773563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document +0.00019777292251713763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document +0.000285571704874486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document +0.00028543482146244363 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document +0.00019434234484256758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document +0.00027854908176986763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document +0.0002847068039566143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document +0.00028672356943064853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document +0.00027782687605808177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document +0.0002843539634105203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document +0.0002894748379090401 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document +0.0002868852440186493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document +0.0002818504885373851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document +0.00028680112812941034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document +0.00019258978168723977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document +0.00028760637934715155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document +0.0002820439443912918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document +0.0002831001054410018 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document +0.00029001901552467397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document +0.00027779449377883156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document +0.00019949837437516796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document +0.0002907306472984446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document +0.00027814858381318327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document +0.00019472790889161432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document +0.00020472626596924125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document +0.0002870045081974301 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document +0.00019812241927078482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document +0.0002817553333369554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document +0.00027829782796642117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document +0.00028289431732284113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document +0.0002795526296717729 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document +0.00027682829988044574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document +0.0002895432402719184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document +0.0002823174903941811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document +0.00028170972351837796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document +0.00027807915877838826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document +0.00028588515681452956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document +0.00028112324090816726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document +0.00020636178289985485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document +0.00019447255290980535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document +0.0002850824220591452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document +0.00027856429520116784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document +0.0002820880676635633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document +0.00028943902215995714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document +0.0002676366291085329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document +0.00023806333809954687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document +0.00024526460430233455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document +0.00023876876664622726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document +0.00023379770334179805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document +0.00024175151269138382 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document +0.00023386583242595706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document +0.00023771797150160827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document +0.0002262748967483896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document +0.0002408148346432682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document +0.00023398651720444235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document +0.00022989433874474592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document +0.00023948500543957772 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document +0.0002331594076859196 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document +0.00023375132439600242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document +0.00023923410909668642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document +0.00023952796315562954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document +0.0002327466076905069 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document +0.00023082758956797212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document +0.0002240509275524448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document +0.00022798879995765268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document +0.000221172516774386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document +0.00021767045123534623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document +0.00021982832794804484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document +0.00021971626543789102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document +0.00022566565206920132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document +0.0002181984894194856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document +0.00021831417549554653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document +0.00021601405421187145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document +0.00022275733725519607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document +0.00021847734911973986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document +0.0002243591012664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document +0.00021688758139483833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document +0.0002182953624789215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document +0.00020475155724026002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document +0.00021498078062960065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document +0.0002157914337233064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document +0.00021781838494967963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document +0.00021723242266814558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document +0.0002176782686553837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document +0.0003486179404943968 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document +0.00034882846352857634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document +0.00031400868448352596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document +0.00030273484020011963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document +0.00029895889118145404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document +0.00029770764609621714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document +0.0002990181332116852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document +0.00029653733972285996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document +0.00029624649222942476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document +0.00029625609720203576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document +0.00029731928930852147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document +0.00029011721326148513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document +0.00028849788197494655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document +0.00021601278623858145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document +0.00021319599281739178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document +0.0002153325290600083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document +0.00018566946174516558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document +0.00020736824394291617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document +0.00020857419820128004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document +0.00020058526129536423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document +0.00020745812166665217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document +0.00020652171015271702 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document +0.00020643808911278608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document +0.00020040513914482103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document +0.00020598050188272898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document +0.0001969184139343296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document +0.0001972748812937012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document +0.0002038556751586195 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document +0.00020245186011313464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document +0.00019950381422038783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document +0.00020837055459665258 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document +0.00020371856218246096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document +0.00019537612301625791 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document +0.00019914984508813857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document +0.0002053787713691309 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document +0.00019082100541008637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document +0.00020397153334531813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document +0.0002021462693077317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document +0.00019609357008124035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document +0.00019693256622486236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document +0.00020007239732428112 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document +0.00020467075741591954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document +0.00019584883400022932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document +0.00019135050391176972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document +0.0003362829834208298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document +0.00034013691154784095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document +0.00033215887031941976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document +0.00032681189065396707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document +0.0003149138485493094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document +0.00030179177307540077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document +0.0002923278437581119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document +0.00029470052278994486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document +0.0002994095093045731 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document +0.00029033525096085037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document +0.00029390798852496565 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document +0.0002916230924130842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document +0.00029419886374594913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document +0.0002865469756730764 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document +0.00021191292549942086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document +0.00021369664817409847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document +0.00021612485624266726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document +0.00022242192634588478 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document +0.00014605095659989698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document +0.00022070626106341693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document +0.0002174420774054071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document +0.00021325858963116995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document +0.0002124322999488052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document +0.0002081218896969054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document +0.0002108710211556957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document +0.00020686867095978426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document +0.00020895752681041895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document +0.00020741922266415738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document +0.0002069112657197308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document +0.00020644627473468118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document +0.00020332991338121604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document +0.0003560895677789848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document +0.00032915779111908214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document +0.00033810613317040864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document +0.00033729626594036923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document +0.00033550342864602944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document +0.00034173474024556906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document +0.000331505340748827 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document +0.0003270050330117195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document +0.00032585275329172556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document +0.0003143383203190604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document +0.00031655199110388894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document +0.00030738872158476413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document +0.00030838388352699285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document +0.0003053596995351888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document +0.00031836304739584593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document +0.000315315435873905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document +0.0003087116248965243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document +0.00030396790625537645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document +0.0003335812246032149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document +0.00034570956323095843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document +0.00034563035636675786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document +0.00033411265479076335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document +0.00034439191141692787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document +0.0003364483125496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document +0.0003299500453608033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document +0.00033163377700074837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document +0.00032638649660627673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document +0.00032616167939645234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document +0.0003205289298760723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document +0.00031939393740815355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document +0.00031593164066731296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document +0.00031928871111254405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document +0.00029670189073175004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document +0.00020517703846735904 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document +0.00020128418186172073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document +0.00019662723895606717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document +0.0001981157042081407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document +0.00019703489037041608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document +0.00019079796331785068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document +0.0001909352306690079 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document +0.00018824662295261396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document +0.00019864275319325954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document +0.00018818516521649587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document +0.00018875694972812844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document +0.00018231621170645482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document +0.00018349407845798273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document +0.00018088971427746906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document +0.00018296284236327237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document +0.0001876011825819916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document +0.000329052068725176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document +0.00032223616273648536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document +0.00031272564089633955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document +0.00031621609908414494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document +0.0003117213560911235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document +0.00030218064069945934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document +0.00030658916600512085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document +0.0002915863534115821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document +0.0002940280138374372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document +0.00029067860468866085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document +0.00028529228063135635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document +0.00028336893301452256 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document +0.0002794668089130099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document +0.00021681361378827842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document +0.0001484664674497246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document +0.00021950558378215133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document +0.00021806860758808645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document +0.00021819568718852282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document +0.00021626925931585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document +0.0001464536143077762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document +0.00021432777088808917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document +0.000213473805865147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document +0.00021397067253964538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document +0.00020758957647437263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document +0.00020687124337683314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document +0.00020630057046511005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document +0.0002091166859352538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document +0.00020777355025615267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document +0.00020709287641496176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document +0.00020736464660577094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document +0.00020062246741862607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document +0.00020693207561942915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document +0.00021151004871893024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document +0.00019930249098689716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document +0.00021589710041231824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document +0.00021369204789905741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document +0.0002147099923936778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document +0.00021077531190389536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document +0.0002100509829113836 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document +0.00021185362601571124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document +0.00020722136637339565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document +0.00020300093701169531 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document +0.00019859737993313477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document +0.00019971314372100164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document +0.00019549908270269278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document +0.00019649820843534028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document +0.00019619415513498067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document +0.00019493006120377898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document +0.00019499409035775506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document +0.00019252988593634277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document +0.00019440768268686405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document +0.00018747161324755577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document +0.0001879575932372779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document +0.00019040707058357506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document +0.0001871931095090703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document +0.00020112966223017096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document +0.00020516878165311017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document +0.00020664735191740533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document +0.00021041398572882962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document +0.00020397992929690396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document +0.0002039978580295561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document +0.00020592785601142126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document +0.0001990755527445265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document +0.00019729564847798732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document +0.00019958182230527032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document +0.0001985037302636386 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document +0.00020204130355115716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document +0.0002000296401958085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document +0.0001983064832295463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document +0.00019663108484195617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document +0.00019510678560556523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document +0.0001873284057063206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document +0.00019311553072495885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document +0.00034652137288816547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document +0.0002813690318850024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document +0.00027697649713138685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document +0.0002755419092534421 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document +0.0002681583054440219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document +0.00026945753192750824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document +0.00026169470768245737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document +0.00026437008960810825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document +0.0002637294838228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document +0.00026491867965088836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document +0.00025504483625138986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document +0.0002545040623796586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document +0.0002546682814073622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document +0.00025545439487142615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document +0.0002626896557978271 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document +0.00025092040940402784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document +0.0002589154885863872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document +0.00024106160482721467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document +0.0002483289690087987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document +0.0002388930282784437 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document +0.00024006340759273874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document +0.00023765248178029045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document +0.00023061351965578936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document +0.00024954224883546477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document +0.00017861017233018525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document +0.00017810832743667658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document +0.00017599709170759497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document +0.00017462723516505223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document +0.0002906316527068669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document +0.00033762141066247166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document +0.00017170670574152494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document +0.00017258674515137717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document +0.0002815386173173926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document +0.0002996845935618989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document +0.0002735268488987296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document +0.0002971738713071517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document +0.0002942690674002763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document +0.0003322222207729567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document +0.0003378721656198464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document +0.00018307262621851067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document +0.00033956081502775057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document +0.00031604820927876276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document +0.00028805657681088917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document +0.00026312293321215633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document +0.00034366936722921455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document +0.0002865256504406559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document +0.0003063615195861786 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document +0.00028412791619666136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document +0.00028060835132727154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document +0.00032544974761560506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document +0.0002647177833217225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document +0.0003152621884896575 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document +0.0003054625140336913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document +0.00031183308312292263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document +0.00018175026696621178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document +0.00017699918328872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document +0.00018222339261441908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document +0.00018348005930964137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document +0.0001810735993810541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document +0.00030846441282038914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document +0.0002972326889310354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document +0.00017433421318235594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document +0.00032799458649525895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document +0.00032482130048512673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document +0.00031943465668672475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document +0.00029615593630484517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document +0.0002893126939511001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document +0.0002849288351723284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document +0.00028383906633569267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document +0.00028072526091262615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document +0.000284239564292377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document +0.0002778903109432523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document +0.0002771644389501471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document +0.0002733316182319337 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document +0.00026362539185869363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document +0.0002636325383220217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document +0.00026740622442302886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document +0.0002646771971853427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document +0.0002628566720605389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document +0.0002644760695434766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document +0.0002623837702310999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document +0.00026088722976772894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document +0.0002567065374799158 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document +0.00018857382101207726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document +0.00019036580399817203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document +0.00018348828065261222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document +0.00018491851780345073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document +0.00018904887260080187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document +0.0001875609304251801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document +0.00018393034720015817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document +0.00018419795526114903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document +0.00018699955623404795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document +0.00018276256902965128 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document +0.00017698045695190812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document +0.00018104650132303642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document +0.00017758206731279688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document +0.00017131402995103497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document +0.000175944428350446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document +0.0003416745727147391 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document +0.0003163259373952889 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document +0.0002804489269172448 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document +0.00028748272397403175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document +0.00027603318345630605 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document +0.000271638824679648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document +0.0002763761210210942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document +0.00026501984873172717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document +0.00026422486894694714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document +0.0002686339100849262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document +0.0002610837453940606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document +0.000260974343729353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document +0.0002599403837029134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document +0.0002937273113238609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document +0.0003341790732600504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document +0.0002620661576600244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document +0.0003027929169239288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document +0.00031944039129326894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document +0.00019025676304139009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document +0.00018680910145009907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document +0.00034215840419416437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document +0.00018618120812119364 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document +0.00018605853095599425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document +0.00018120712626096538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document +0.00018315079292495327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document +0.00018362556449041974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document +0.0001780024456718171 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document +0.00033296526436178697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document +0.0001802398632282846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document +0.00017340263100798256 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document +0.00017755840547238697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document +0.00018419413735260606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document +0.00017869518174591322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document +0.00017526271460129484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document +0.00017852168597981907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document +0.00017566536156787157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document +0.00017589867964432936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document +0.00017831487394075305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document +0.00017837310528935862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document +0.00018200908814216548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document +0.0001795136627511612 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document +0.0003414021775300033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document +0.00017177291787788502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document +0.0003441900648571877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document +0.0003394534597060673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document +0.0003236887233114832 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document +0.0001639544129688747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document +0.00019137443753211255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document +0.00018575146284680153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document +0.00019184792863440243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document +0.00018966043065679055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document +0.00017968851317035848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document +0.00018479881897661546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document +0.0001813642692683015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document +0.0001686449798983066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document +0.00018516104592230446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document +0.00031283726601066385 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document +0.0003248607542883853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document +0.00031583241601202365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document +0.00031238270857730376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document +0.000307150592403979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document +0.00029443829986847044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document +0.0002942723732234677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document +0.00023514930666443422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document +0.0020776328951453444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document +0.0021768234410538883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document +0.002106973549276289 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document +0.002110915756171751 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document +0.0017032382109816464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document +0.0019047944877712286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document +0.0019402711744016077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document +0.0006264790011223686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document +0.0017885401938106643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document +0.0003547982093445404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document +0.00035934014428504944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document +0.00035707704501371544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document +0.00035287930712815354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document +0.00035977166728996823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document +0.0003581675664109838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document +0.0003548617059697185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document +0.0003639582000286208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document +0.00035375839698688127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document +0.0003743722020080678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document +0.0003530399715341242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document 
+0.00035511875882752406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document +0.0003618733574783154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document +0.00035185243285420104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document +0.0003541503739732106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document +0.0003631679485751914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document +0.00035748045578182274 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document +0.0003606490690555877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document +0.0003626383296610091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document +0.00035442644361264756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document +0.00035978370170539796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document +0.0003585562375341541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document +0.0003601958372888019 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document +0.000350277765402227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document +0.0003616521184211704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document +0.0003620625543608188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document +0.0003560781983850704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document +0.0003553209610592676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document +0.00035905348643915075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document +0.00034744258805696526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document +0.00035462784035661496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document +0.00034768186175100895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document +0.0003568534635532736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document +0.00035586511544371234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document +0.0003524567827568137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document +0.0003512453770426313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document +0.0003591792726468799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document +0.0003514024529343127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document +0.0003584880112586934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document +0.00035133552916418045 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document +0.0003600811981350215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document +0.0003571663974228119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document +0.00035768103378874214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document +0.00035939205561113694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document +0.00035186773916029825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document +0.0003542829672490847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document +0.0003592783642898726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document +0.0003556367340099302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document +0.00035391392271377027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document +0.00035486725707484836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document +0.00034866743396828035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document +0.0003517219808644735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document +0.00034874458549673823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document +0.000355773136961014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document +0.00035611750387841917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document +0.00035305602013916315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document +0.0003578207127071924 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document +0.00035514635841943707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document +0.00034816946212866206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document +0.0003512707269761496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document +0.0003483392117980654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document +0.0003572169607204321 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document +0.00035139153281660794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document +0.00035536422129036537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document +0.000352017164107143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document +0.000351889550179365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document +0.000358759689953589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document +0.0003569286079869268 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document +0.0003657752958602099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document +0.00035396127934790697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document +0.0003618565071224743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document +0.00035146051531973204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document +0.00036107135765783567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document +0.00035019554279994576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document +0.00035567858879904983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document +0.0003504753174793183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document +0.00035931140831329194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document +0.0003502967866002823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document +0.0003532911801041972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document +0.0003583543013070199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document +0.0003566243489931224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document +0.0003468752314799221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document +0.0003597840618138091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document +0.00035128822484768084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document +0.00035889496943437507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document +0.000352400524650424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document +0.0003518689536768735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document +0.00035866864741303467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document +0.0003454687659106334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document +0.00035348007259317576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document +0.0003539752270940644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document +0.00035146495994081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document +0.00035397212846310423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document +0.00035208246467162587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document +0.0003490843168676626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document +0.00035299633658644394 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document +0.00034868327466167065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document +0.00035941351365601583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document +0.0003545343062735255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document +0.0003528956380445978 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document +0.0003553355770443352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document +0.0003644224004937743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document +0.00035234291036216907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document +0.0003596237469847771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document +0.0003531996065735989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document +0.0003547177054106099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document +0.0003575586499260483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document +0.00035262635135283667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document +0.0003624191962188944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document +0.0003488398052948616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document +0.0003598294093147917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document +0.00035583006534466323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document +0.00035403139653225103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document +0.00036134702642187156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document +0.0003573689927162834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document +0.0003577141131435527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document +0.00035208814419277406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document +0.00035996720683665625 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document +0.00035415304658912596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document +0.00036353353029443546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document +0.0003537326003150983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document +0.00036053976358299083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document +0.000352380489373494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document +0.00036154661616900994 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document +0.00035959332325963614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document +0.0003597954667189692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document +0.0003563108270597542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document +0.0003582891940460143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document +0.0003497728210484297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document +0.0003549834902179354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document +0.0003529828233484542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document +0.00034627483903285777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document +0.00035569006572589215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document +0.00035449377946910314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document +0.00035802844396194623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document +0.0003617277809353208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document +0.00035034118898654814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document +0.000351091193908611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document +0.0003527914342210668 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document +0.00035028288369781376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document +0.00035775745592780506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document +0.0003449630690661468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document +0.0003583490698830361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document +0.0003476995746684122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document +0.0003535632505019212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document +0.00035640180641147417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document +0.000361731045691765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document +0.0003534082129597368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document +0.0003550344149828664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document +0.00035363002411364057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document +0.0003537265579677396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document +0.00034950531383577937 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document +0.00035008511827347514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document +0.00035594533400871325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document +0.00035266312861335946 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document +0.00035280268794863923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document +0.0003565470391528536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document +0.0003588492322689137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document +0.00035469909697832775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document +0.00034712082813410526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document +0.000348701157101807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document +0.0003500192014479944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document +0.00035120560544669755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document +0.00035403656850437445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document +0.00035852376560749366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document +0.0003534754068111774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document +0.00035591740046720765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document +0.000348522354782563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document +0.0003533533959664415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document +0.00035631425964030697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document +0.0003485886551574741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document +0.00035917652631065777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document +0.0003482975272111288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document +0.00035580661277480167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document +0.0003492290722955348 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document +0.00034989284450240613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document +0.0003545677216162781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document +0.00034622286859463484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document +0.00036070626989861965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document +0.00035518365036320786 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document +0.00035272907057848406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document +0.0003547343638218734 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document +0.0003496450144966242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document +0.0003537407829294287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document +0.0003489722653985685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document +0.00035057186899911295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document +0.0003507566548933051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document +0.00035630360179023747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document +0.00035631362503416367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document +0.0003490204248026821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document +0.00035761724058371226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document +0.00035037664777467137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document +0.000353402110481068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document +0.00034524163568371745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document +0.00035528523728570974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document +0.00034784916132431703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document +0.00034928476408048925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document +0.00034989205973784984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document +0.00034201664404094254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document +0.0003529676016338611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document +0.00034643433682346637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document +0.0003511666373001904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document +0.00034828669066575333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document +0.0003494625207264413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document +0.0003458957535879216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document +0.0003543020478990003 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document +0.00034754384069014956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document +0.0003598856392240133 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document +0.0003503335458553846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document +0.00035919595619778716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document +0.00035767737970754404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document +0.00035197152783998165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document +0.0003549609834422404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document +0.0003568184100569753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document +0.0003512652818651935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document +0.00035912648958665754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document +0.00034764526964056546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document +0.000352439784960359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document +0.00035295886560764226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document +0.0003518132693658672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document +0.00035589987915465713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document +0.00034923863317385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document +0.0003457987267929692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document +0.0003560928663480501 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document +0.0003529603811204932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document +0.0003524438555443043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document +0.0003438847030263783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document +0.00035981978898461613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document +0.0003446342778566972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document +0.00035529584995236537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document +0.00034855740895831116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document +0.00034932634912802544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document +0.00035805518303064666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document +0.0003497941877073061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document +0.00035774398685405447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document +0.0003560421780316607 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document +0.0003508844468369392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document +0.00035731928892270107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document +0.0003557884626314314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document +0.00034992996760289355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document +0.000360752554360921 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document +0.0003452321668708545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document +0.0003591745226131023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document +0.00035256981433229084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document +0.00035378123159712034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document +0.000350464354895999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document +0.00035074625557389677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document +0.00035025894701994667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document +0.00035437902514857614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document +0.0003514684519732232 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document +0.00035449717909633905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document +0.0003436816402714221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document +0.00035139158071782116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document +0.0003509424079843335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document +0.000343894618577506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document +0.0003500789770661659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document +0.0003407788080680086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document +0.0003581908175239701 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document +0.0003465541618780918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document +0.00034600228792437736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document +0.00034416738982773204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document +0.0003519900340150641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document +0.000343369616864659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document +0.0003544993883274688 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document +0.0003504441365073392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document +0.00034859160702727056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document +0.00035355909532647185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document +0.0003471900922691849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document +0.0003563015508709187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document +0.0003487888744148821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document +0.00034711767548688336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document +0.0003530734609369085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document +0.00035123969242560935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document +0.0003517127620891489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document +0.00035232835416868673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document +0.0003524437481912308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document +0.0003525996167005602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document +0.00035064770545242043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document +0.00035311558274981226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document +0.00034952204800569914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document +0.0003541471367344846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document +0.00035418812454561825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document +0.0003528951372900714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document +0.0003542338042975688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document +0.00034937738939942796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document +0.0003522182190878447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document +0.0003501406466507449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document +0.00034973079877492633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document +0.0003485274567713538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document +0.00034999308679368985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document +0.0003570051724707296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document +0.00034567230462019706 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document +0.00035529000940160696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document +0.00034956512308671755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document +0.0003496962834028953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document +0.0003468745282493457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document +0.0003502717155809202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document +0.0003556240880896514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document +0.0003515109488424343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document +0.0003563156688192592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document +0.00035040277363989817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document +0.0003481408593290717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document +0.0003624575124332874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document +0.0003522684124250313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document +0.00035286996027653544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document +0.00034967623997256725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document +0.00035182649587602765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document +0.0003524892557026489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document +0.0003507642477451811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document +0.00036190408389835666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document +0.00035102739424880766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document +0.00035239718753257265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document +0.00035298076121821316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document +0.0003478704389752654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document +0.0003503109191567942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document +0.00035143250975654426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document +0.0003480663923069012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document +0.00035691540219998623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document +0.000348815437166351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document +0.00035202073257766225 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document +0.0003491569096274706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document +0.00035277390475511834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document +0.0003524972090026609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document +0.0003504854249750236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document +0.00034740238025423914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document +0.00034968015462277606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document +0.0003493798632762674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document +0.0003488202537862122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document +0.0003525461864643725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document +0.00034903815232825664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document +0.00035536982539258216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document +0.00034858083265155483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document +0.0003505014973608067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document +0.00035327984042622104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document +0.0003503286677453136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document +0.00035835274842442816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document +0.00034970302660275595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document +0.000357929573140149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document +0.0003517238649788585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document +0.00036097027318848475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document +0.0003502734074110026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document +0.00035801510806036273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document +0.0003568006373479869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document +0.00036128108717454636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document +0.0003563436883111686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document +0.00035559725321852463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document +0.00035089656006854944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document +0.000359453964362057 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document +0.00035629498059104033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document +0.0003622207707090437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document +0.0003540946784512821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document +0.0003594750565232011 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document +0.0003566007415086991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document +0.0003562142599126134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document +0.0003569948186744601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document +0.00035166554847920186 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document +0.00035047994419295137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document +0.0003561578193739437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document +0.00035470866838811544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document +0.00034216920464876335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document +0.0003550021513075795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document +0.0003488045105938729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document +0.0003513340720840151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document +0.0003448558566387584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document +0.0003460966026953241 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document +0.0003488157616036459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document +0.0003446120387842362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document +0.000351528602987427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document +0.00035661118227454713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document +0.0003551342699877457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document +0.0003478953397924445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document +0.00034625782458988215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document +0.0003527515447405871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document +0.00034823744889805696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document +0.00034823314560254406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document +0.00035162668292961944 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document +0.0003477307716074623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document +0.0003446457989477787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document +0.00034782916273767795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document +0.0003517249130302248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document +0.0003449873430908556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document +0.00034841291749669877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document +0.0003466028498941749 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document +0.0003486436831199424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document +0.0003478279234211838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document +0.0003495903653274374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document +0.00034896893881218957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document +0.000348941645312426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document +0.0003474221308416894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document +0.0003462621543839385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document +0.0003669373860863891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document +0.00034691156268163006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document +0.0003527774103765281 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document +0.00034684565672734663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document +0.0003454250599604457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document +0.0003541536557159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document +0.000345735737037366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document +0.0003524669816385214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document +0.0003441817133096468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document +0.0003519093265859089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document +0.00035080085480352095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document +0.00035285227929327434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document +0.00034354836346901676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document +0.00034789770937373467 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document +0.000343665920520102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document +0.0003490884931060568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document +0.00034380029463398654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document +0.00034874768005099945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document +0.0003457058510967673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document +0.00034644265227023904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document +0.00035008339858594957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document +0.0003462377193296194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document +0.0003620491787114201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document +0.000348717011044469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document +0.00034370072363913706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document +0.0003551981066775649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document +0.0003500119496799342 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document +0.0003485082952669081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document +0.0003508155580978919 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document +0.00035311375163251416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document +0.00034945972003423253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document +0.0003474220353789879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document +0.0003536443686585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document +0.0003560350489042953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document +0.0003493655927914396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document +0.0003528423977146383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document +0.00035255554724471217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document +0.0003479760010190111 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document +0.00035458598862501956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document +0.0003458990560538315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document +0.00035157946422379875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document +0.00034736860650169996 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document +0.0003529152313394119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document +0.00034586294329524465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document +0.00035707214923794877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document +0.0003509580363496512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document +0.00035244176725524474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document +0.0003467539557999047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document +0.00034919687962275546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document +0.00035094031731719953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document +0.0003484309008351352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document +0.0003485409424916253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document +0.0003499590776117838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document +0.0003492842758957848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document +0.0003529712275178912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document +0.0003566141287087449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document +0.0003649496522047409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document +0.0003563218912208234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document +0.00035614782126966145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document +0.0003531944298453266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document +0.0003535950949566616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document +0.0003544295554928795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document +0.0003519908503740376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document +0.00035752817626134463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document +0.0003515322689589972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document +0.0003486893890307115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document +0.0003446520464889867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document +0.0003509421562481707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document +0.00035335015702909084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document +0.0003490178167345008 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document +0.0003520497821155174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document +0.0003549762618908944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document +0.00035072190850833103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document +0.0003542458638526423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document +0.000352419194572916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document +0.0003545102564672614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document +0.0003495437992331806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document +0.0003542843376993964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document +0.000352827529313958 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document +0.00035442506093223886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document +0.0003496970719044257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document +0.0003553096424442362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document +0.00034986845565067564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document +0.000352131055186658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document +0.0003527021708198983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document +0.00034905885414547214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document +0.0003583433842468394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document +0.00034409435202828383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document +0.00034846410520871483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document +0.0003554459991927314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document +0.00035310507471843076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document +0.000350028910786098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document +0.00035049727458009896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document +0.0003519047735925826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document +0.0003513027429919726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document +0.0003626947260354396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document +0.0003500087324849783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document +0.0003618315726725285 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document +0.0003535385113938023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document +0.0003487064058517615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document +0.0003618709124780938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document +0.00035040070335625915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document +0.0003506279032267829 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document +0.0003498435310527524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document +0.0003554634749821431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document +0.00035091209738758963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document +0.00035034103678978573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document +0.00035398931854386146 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document +0.00035495529304989485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document +0.00036067883473356603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document +6.322825248625475e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document +2.4432314037946264e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document +5.6313888721313454e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document +2.4208171781595055e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document +2.325811856369237e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document +2.4010790356322705e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document +5.36773610843632e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document +1.360574433501002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document +1.3076540344853244e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document +1.3386534334886313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document +1.2498103719605153e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document +1.403763836949682e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document +1.3636756723495417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document +1.2242489446940814e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document +1.2398255818973339e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document +1.2972616994216281e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document +1.3947809855914134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document +1.3144843787829514e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document +1.1693809976572487e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document +1.3677252682893802e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document +1.3940876719849597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document +1.4222245138730965e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document +1.3201677767919704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document +1.1421717796486169e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document +1.2890514724498703e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document +1.3649507648749037e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document +1.2400732563490717e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document +1.1557681453277616e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document +1.2294483595964517e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document +1.2137484472122283e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document +1.3299663426456e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document +1.2461984216479532e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document +1.4666434217609636e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document +1.1876997894686238e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document +1.2939155338964078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document +1.3859590039728515e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document +1.317917848615668e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document +1.1335281536110342e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document +1.2889923952861426e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document +1.3471671647053326e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document +1.2221720014475102e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document +1.2632647276287541e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document +1.28276219004076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document +1.36213704321643e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document +1.2414858625261553e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document +1.3173700421883744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document +1.295597796725686e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document +1.242783936442904e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document +1.2417374088427464e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document +1.2134479405400744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document +1.3090040663304255e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document +1.2713470581614905e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document +5.5750231378906594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document +5.777597358425469e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document +5.349786767471258e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document +5.675165050453583e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document +5.482611216158831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document +5.065421899890121e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document +5.384718357480146e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document +4.872037363236061e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document +4.532709250783155e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document +5.7257963030489613e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document +4.9014365579652036e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document +5.722863552770969e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document +6.149911636146833e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document +5.2178057608273506e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document +4.990228161160431e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document +5.866186875255134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document +5.004185734360719e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document +4.79401853705107e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document +5.435219965052376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document +5.035997225792266e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document +5.622401774211625e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document +5.028826157387559e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document +5.596379470128795e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document +6.027824493191489e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document +5.5358270009931474e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document +5.9839051807685496e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document +5.1221077499249595e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document +5.517228560620279e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document +5.1687858285052305e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document +5.684188244145645e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document +5.212693275535878e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document +4.8551007022784084e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document +5.4888506639203145e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document +5.345098688527242e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document +4.8506420625516594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document +5.132168603397676e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document +5.719476795114223e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document +5.7448621149792696e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document +4.9068410568059265e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document +5.382937299647678e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document +4.8288432136304634e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document +5.841703200305416e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document +5.1589611587885584e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document +6.031113829732574e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document +5.4558202844532094e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document +5.341852317196142e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document +5.1402942738369954e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document +5.735421384377395e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document +5.473629863586958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document +5.4708993245733936e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document +4.931161863634078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document +5.104173022127248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document +5.510157161510824e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document +5.652501401782597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document +5.7273656573031666e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document +5.638363224821738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document +5.6128115396668704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document +5.00304877998141e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document +5.596120554779096e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document +5.5280923889040006e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document +5.223477917938408e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document +5.29472809986569e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document +2.205682378243213e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document +1.4367563720603185e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document +3.5506193487931076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document +3.0442910855821778e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document +2.2540042508019627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document +2.6880163202623216e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document +2.534473148048727e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document +2.6560945431318916e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document +2.547470248967691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document +2.5248825388073738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document +2.5828729575000054e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document +2.4026583817957736e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document +2.3930425429834413e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document +2.5037365362599724e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document +2.6696745470595603e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document +2.140323051341762e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document +2.617354786691592e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document +1.538359101762691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document +1.2871029252377856e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document +2.255195411289217e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document +2.4832313897952067e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document +9.303873918189968e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document +2.179532302620228e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document +1.9750517506901206e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document +2.7740420380648435e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document +2.7813714782319335e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document +4.1595357937609806e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document +2.741365122389175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document +2.117451071361901e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document +1.7132649760565998e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document +1.7492547092602047e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document +1.7499951097392276e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document +1.6632444789170958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document +1.6678802252361607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document +1.5519208704558896e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document +1.652420992967167e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document +1.6119931034508755e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document +1.6638882076736552e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document +1.7198076782652946e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document +1.572927860565175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document +1.5194822618169918e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document +1.6677776832669846e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document +1.595612492245688e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document +1.682350633181197e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document +1.663983380609724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document +1.710187842689243e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document +1.5733697527539038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document +1.6972104757911438e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document +1.6610142847616577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document +1.61094882403031e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document +1.4789207305138325e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document +1.639299617676302e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document +1.3241204512116132e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document +8.582260726625535e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document +8.213000975576739e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document +9.549247732811947e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document +9.17242785339013e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document +7.632868223725218e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document +8.674401118222175e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document +9.124384255505347e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document +8.344222222417358e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document +8.992299957499065e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document +8.76689497361025e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document +7.973396239586015e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document +9.006935606644125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document +8.725545954955498e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document +1.215449694669174e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document +3.3041720284158646e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document +2.0593512412624502e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document +1.893608946986248e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document +1.737111666788535e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document +1.4915923449873955e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document +2.289370239067605e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document +2.8615335689614638e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document +8.847283630883125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document +1.8175470362373804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document +1.8152226683368038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document +1.789149655314284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document +1.7690523036477663e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document +1.8333732213753644e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document +1.8794105687718654e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document +1.721841156706417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document +2.0612008685724796e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document +1.9297370681336376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document +2.0188440409661018e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document +5.1741216329695265e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document +1.3417913926038429e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document +1.1010813016469651e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document +1.1252416134320087e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document +1.2801744104313002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document +1.3041514955795817e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document +1.3428837580879075e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document +1.320809382267804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document +1.3451566676555968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document +1.228284926657501e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document +1.2410599573923043e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document +1.3815343367377182e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document +1.3895126265148832e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document +1.2306773644401741e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document +1.32981021906281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document +1.101337469221607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document +1.513094184404692e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document +1.1073759547073234e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document +1.2879348765857567e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document +9.619595770228435e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document +1.2384340836286436e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document +1.1766667232211577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document +1.2871049236196452e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document +1.2010645926497744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document +1.3971428231518597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document +1.2283733550547932e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document +1.2659530508255308e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document +1.551775613074462e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document +1.1169413343776979e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document +1.1433700593712463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document +4.964773647323492e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document +1.0995586595687313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document +1.2957393071411267e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document +2.75899247407709e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document +2.8269344597344854e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document +2.329108187246831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document +2.4231761430460284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document +1.2434140512230442e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document +1.638718338352859e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document +3.272953556801187e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document +6.061314500486327e-06 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document +1.2465979731210292e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document +1.2737557327967737e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document +1.038428658075627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document +2.61666472045566e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document +3.6506873212272224e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document +1.5066359138295701e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document +1.1166290872121178e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document +1.5546966228590285e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document +1.2583434625014828e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document +1.3398826881300862e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document +1.2944933160515968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document +1.0971437399901365e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document +1.2787922795775774e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document +1.404979227816985e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document +1.3344734431324463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document +4.886031157107555e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document +3.277261443596394e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document +3.5057957685786495e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document +3.287625301718589e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document +3.1370056372668855e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document +3.186092015785841e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document +7.271819324142512e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document +0.001451215788905126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document +0.0014486847196258788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document +0.0008861032722895899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document +0.0018119590809459816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document +0.0008916937917547129 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document +6.960128832809415e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document +0.002008403651063623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document +0.0014374900742131454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document +0.00180213596996716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document +0.001956178877532413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document +0.0008829547017667033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document +0.0008910853619157279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document +0.0018260998845299973 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document +0.0012499632072059553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document +0.00125398260359913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document +0.0012541704774729071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document +0.0012527268234360602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document +0.0012532925243737164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document +0.0012456396241204315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document +0.0012589894424352072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document +0.001508020123999618 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document +0.00333096950781965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document +0.0033233414614415547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document +0.003512387990689828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document +0.0035091382940513126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document +0.003514155927147005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document +0.003327108000579638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document +0.003329106196589836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document +0.003505604148738077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document +0.003324825759567855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document +0.0033248240149804913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document +0.0033385962112851358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document +0.0035043186296553615 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document +0.003340469505431529 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document +0.0035106889084796276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document +0.0033309469281030167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document +0.003340337858029757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document +0.003505919861097801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document +0.0003882924098240512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document +0.0005759963691850877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document +0.0005959971675332674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document +0.0006026179290353799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document +0.0005824184320784846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document +0.0005854598548616037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document +0.0005903767055633473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document +0.0005930306490982049 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document +0.000569425602700746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document +0.0005675060415179408 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document +0.0005772431621253389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document +0.0005678026053826858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document +0.0005700398263483378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document +0.0005669467963528824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document +0.0005701015953324305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document +0.0005795907287413296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document +0.0005735602737531164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document +0.0005749862745842101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document +0.0005693257015931971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document +0.0005716568794795563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document +0.0005761083919774021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document +0.0005688343169797355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document +0.0005807913190929842 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document +0.0005710229258078636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document +0.0005704083039826862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document +0.0005862132348308056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document +0.0005717662049559556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document +0.0005858155213694451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document +0.0005812012281792392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document +0.0005803981414588498 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document +0.0005700102108287723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document +0.0005719243459052329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document +0.0005867253401661752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document +0.0005731087218860733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document +0.0005712197789109317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document +0.0005702376926310089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document +0.0005700411527742972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document +0.0005828090098178196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document +0.0005770140826168056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document +0.0005723509664597896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document +0.0005755499231836962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document +0.0005636407438471367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document +0.0005640281556500104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document +0.0005633159058766496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document +0.0005638034311151449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document +0.0005630066273073224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document +0.0005631803831128559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document +0.0005631228881679657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document +0.0005628178701487633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document +0.0005624448092256196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document +0.0005620957024062329 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document +0.0005614201504177484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document +0.0005616890951464056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document +0.0005611348559279058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document +0.0005604238061828518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document +0.0005603301490194237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document +0.0005607291294548833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document +0.0005605234569930727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document +0.0005613778566640694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document +0.0005610248539992471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document +0.0005599977416780475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document +0.0005603632562116935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document +0.0005599177479509897 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document +0.0005595202318298379 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document +0.0005600975633499175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document +0.0005614075491213365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document +0.000612563885043477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document +0.0005515469909644413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document +0.0005526782014946906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document +0.0005472463408095445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document +0.0005502284746004587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document +0.0005414514790555363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document +0.0005513499500134784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document +0.0005391391454105187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document +0.0005415836910001838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document +0.0005208132468536551 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document +0.0005889827143132871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document +0.0005822520817765276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document +0.0004173155230758696 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document +0.0009994361338078242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document +0.001087156194657966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document +0.0010667737163656816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document +0.0009602877882124873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document +0.0008968956271971105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document +0.0009198034843762967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document +0.0009423901016715341 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document +0.0009674094553686345 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document +0.0009858331322519164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document +0.0009970593645879198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document +0.0010027035193731686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document +0.0010128291154221853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document +0.0010215631382631918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document +0.0010288663771461238 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document +0.0010346219929285867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document +0.00104544019940344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document +0.0010525172676724333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document +0.0010609529620775127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document +0.0010725892748610153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document +0.0010818563598181568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document +0.0010992760196793917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document +0.0011178992762079917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document +0.001124687532085676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document +0.001118303661267191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document +0.0010206825575416534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document +0.0005512280117499715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document +0.004474659408857016 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document +0.00409944473890653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document +0.005137179939941845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document +0.005143172251066109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document +0.005206134363352808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document +0.004892747858974329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document +0.004844731352552902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document +0.005308320169123755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document +0.005124709815666577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document +0.005424710744483826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document +0.00538244648861977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document +0.0029107284679086853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document +0.0026825258998444705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document +0.0026904503191419243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document +0.002687906577174073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document +0.002850165346048818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document +0.005322698571717847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document +0.004450334290869719 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document +0.004700990083440683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document +0.003903568556500995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document +0.00390561515396931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document +0.0039046402900912262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document +0.003907454839379547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document +0.0038583224578603824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document +0.0037914116657695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document +0.003786665266798682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document +0.003792000802430658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document +0.00319266847466091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document +0.0032658716699838944 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document +0.0034801959532460023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document +0.0028307012092022594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document +0.0028420360878146276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document +0.0028410455248484914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document +0.00283497183526842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document +0.002840187195459487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document +0.0028398709431369834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document +0.004364722843422023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document +0.004093255713117101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document +0.004092331079566252 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document +0.004005326985579649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document +0.0036205502856964207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document +0.003625316793034984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document +0.003604743435602363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document +0.0035405823343673125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document +0.0041601413517253945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document +0.005886303658937057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document +0.003600909532810332 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document +0.0034941365817168658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document +0.0004992164842980224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document +0.00032927705604725614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document +0.0002860154190878753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document +0.0002845217585425619 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document +0.0002743528685497456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document +0.00026025323737738766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document +0.00023493876414603155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document +0.00029665994994226705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document +0.00031808102075993956 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document +0.00031813573046011285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document +0.0002711905171855542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document +0.00028892513401817095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document +0.00030003908676979083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document +0.00026839878771944684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document +0.00029155935002690497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document +0.0002998624927624209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document +0.0003091705447974841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document +0.00026873195794309786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document +0.00027721873498527547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document +0.0002841662554024377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document +0.0002839461156551537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document +0.0002861705604659811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document +0.0002460995649635886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document +0.00019420142619795496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document +0.00021967677816173628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document +0.0002620283200480949 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document +0.0002433390542188936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document +0.00021254976608350767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document +0.00022094815569522115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document +0.000342862378668244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document +0.00033784225259118157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document +0.0003367278459543952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document +0.00029843279042852765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document +0.0002926583661257988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document +0.00029320337282010673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document +0.00029281450669483455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document 
+0.0002915338187002653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document +0.0002864226923084572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document +0.00028643439083586396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document +0.00028253710956299054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document +0.0002810856078805806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document +0.00031474941344656715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document +0.0002139130222205655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document +0.0003084648871862831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document +0.0003309477872140129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document +0.0003360096824695161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document +0.0003355452655196557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document +0.00038119390366386037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document +0.00038078927630086064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document +0.0003386200917551554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document +0.0002158905159938882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document +0.00021621682877018768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document +0.00021553306942740535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document +0.00021581563462722296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document +0.0002157694110556169 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document +0.000215643699847159 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document +0.00021532716715168094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document +0.00021531221326022472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document +0.0002831801179028896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document +0.0002514844936507595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document +0.00031638782778107964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document +0.0002749197545278445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document +0.00026159721512464495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document +0.0002630052420096968 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document +0.00031106811228913666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document +0.0002852973415334161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document +3.7555372465932136e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document +0.003548077173506675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document +0.0018372203137874265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document
diff --git a/ALCF/data-lists/sunspot/falcon.txt b/ALCF/data-lists/sunspot/falcon.txt
new file mode 100644
index 0000000000..e5afb89283
--- /dev/null
+++ b/ALCF/data-lists/sunspot/falcon.txt
@@ -0,0 +1,501 @@
+0.0003547982093445404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document falcon +0.00035934014428504944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document falcon +0.00035707704501371544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document falcon +0.00035287930712815354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document falcon +0.00035977166728996823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document falcon +0.0003581675664109838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document falcon +0.0003548617059697185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document falcon +0.0003639582000286208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document falcon +0.00035375839698688127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document falcon +0.0003743722020080678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document falcon +0.0003530399715341242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document falcon +0.00035511875882752406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document falcon +0.0003618733574783154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document falcon +0.00035185243285420104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document falcon +0.0003541503739732106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document falcon +0.0003631679485751914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document falcon +0.00035748045578182274 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document falcon +0.0003606490690555877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document falcon +0.0003626383296610091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document falcon +0.00035442644361264756
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document falcon +0.00035978370170539796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document falcon +0.0003585562375341541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document falcon +0.0003601958372888019 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document falcon +0.000350277765402227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document falcon +0.0003616521184211704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document falcon +0.0003620625543608188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document falcon +0.0003560781983850704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document falcon +0.0003553209610592676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document falcon +0.00035905348643915075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document falcon +0.00034744258805696526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document falcon +0.00035462784035661496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document falcon +0.00034768186175100895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document falcon +0.0003568534635532736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document falcon +0.00035586511544371234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document falcon +0.0003524567827568137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document falcon +0.0003512453770426313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document falcon +0.0003591792726468799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document falcon +0.0003514024529343127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document falcon +0.0003584880112586934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document falcon +0.00035133552916418045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document falcon +0.0003600811981350215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document falcon +0.0003571663974228119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document falcon +0.00035768103378874214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document falcon +0.00035939205561113694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document falcon +0.00035186773916029825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document falcon +0.0003542829672490847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document falcon +0.0003592783642898726 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document falcon +0.0003556367340099302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document falcon +0.00035391392271377027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document falcon +0.00035486725707484836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document falcon +0.00034866743396828035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document falcon +0.0003517219808644735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document falcon +0.00034874458549673823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document falcon +0.000355773136961014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document falcon +0.00035611750387841917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document falcon +0.00035305602013916315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document falcon +0.0003578207127071924 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document falcon +0.00035514635841943707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document falcon +0.00034816946212866206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document falcon +0.0003512707269761496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document falcon +0.0003483392117980654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document falcon +0.0003572169607204321 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document falcon +0.00035139153281660794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document falcon +0.00035536422129036537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document falcon +0.000352017164107143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document falcon +0.000351889550179365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document falcon +0.000358759689953589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document falcon +0.0003569286079869268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document falcon +0.0003657752958602099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document falcon +0.00035396127934790697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document falcon +0.0003618565071224743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document falcon +0.00035146051531973204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document falcon +0.00036107135765783567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document falcon +0.00035019554279994576 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document falcon +0.00035567858879904983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document falcon +0.0003504753174793183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document falcon +0.00035931140831329194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document falcon +0.0003502967866002823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document falcon +0.0003532911801041972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document falcon +0.0003583543013070199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document falcon +0.0003566243489931224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document falcon +0.0003468752314799221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document falcon +0.0003597840618138091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document falcon +0.00035128822484768084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document falcon +0.00035889496943437507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document falcon +0.000352400524650424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document falcon +0.0003518689536768735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document falcon +0.00035866864741303467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document falcon +0.0003454687659106334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document falcon +0.00035348007259317576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document falcon +0.0003539752270940644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document falcon +0.00035146495994081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document falcon +0.00035397212846310423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document falcon +0.00035208246467162587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document falcon +0.0003490843168676626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document falcon +0.00035299633658644394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document falcon +0.00034868327466167065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document falcon +0.00035941351365601583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document falcon +0.0003545343062735255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document falcon +0.0003528956380445978 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document falcon +0.0003553355770443352 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document falcon +0.0003644224004937743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document falcon +0.00035234291036216907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document falcon +0.0003596237469847771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document falcon +0.0003531996065735989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document falcon +0.0003547177054106099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document falcon +0.0003575586499260483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document falcon +0.00035262635135283667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document falcon +0.0003624191962188944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document falcon +0.0003488398052948616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document falcon +0.0003598294093147917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document falcon +0.00035583006534466323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document falcon +0.00035403139653225103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document falcon +0.00036134702642187156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document falcon +0.0003573689927162834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document falcon +0.0003577141131435527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document falcon +0.00035208814419277406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document falcon +0.00035996720683665625 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document falcon +0.00035415304658912596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document falcon +0.00036353353029443546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document falcon +0.0003537326003150983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document falcon +0.00036053976358299083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document falcon +0.000352380489373494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document falcon +0.00036154661616900994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document falcon +0.00035959332325963614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document falcon +0.0003597954667189692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document falcon +0.0003563108270597542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document falcon +0.0003582891940460143 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document falcon +0.0003497728210484297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document falcon +0.0003549834902179354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document falcon +0.0003529828233484542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document falcon +0.00034627483903285777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document falcon +0.00035569006572589215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document falcon +0.00035449377946910314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document falcon +0.00035802844396194623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document falcon +0.0003617277809353208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document falcon +0.00035034118898654814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document falcon +0.000351091193908611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document falcon +0.0003527914342210668 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document falcon +0.00035028288369781376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document falcon +0.00035775745592780506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document falcon +0.0003449630690661468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document falcon +0.0003583490698830361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document falcon +0.0003476995746684122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document falcon +0.0003535632505019212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document falcon +0.00035640180641147417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document falcon +0.000361731045691765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document falcon +0.0003534082129597368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document falcon +0.0003550344149828664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document falcon +0.00035363002411364057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document falcon +0.0003537265579677396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document falcon +0.00034950531383577937 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document falcon +0.00035008511827347514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document falcon +0.00035594533400871325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document falcon +0.00035266312861335946 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document falcon +0.00035280268794863923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document falcon +0.0003565470391528536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document falcon +0.0003588492322689137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document falcon +0.00035469909697832775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document falcon +0.00034712082813410526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document falcon +0.000348701157101807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document falcon +0.0003500192014479944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document falcon +0.00035120560544669755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document falcon +0.00035403656850437445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document falcon +0.00035852376560749366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document falcon +0.0003534754068111774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document falcon +0.00035591740046720765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document falcon +0.000348522354782563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document falcon +0.0003533533959664415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document falcon +0.00035631425964030697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document falcon +0.0003485886551574741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document falcon +0.00035917652631065777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document falcon +0.0003482975272111288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document falcon +0.00035580661277480167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document falcon +0.0003492290722955348 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document falcon +0.00034989284450240613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document falcon +0.0003545677216162781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document falcon +0.00034622286859463484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document falcon +0.00036070626989861965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document falcon +0.00035518365036320786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document falcon +0.00035272907057848406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document falcon +0.0003547343638218734 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document falcon +0.0003496450144966242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document falcon +0.0003537407829294287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document falcon +0.0003489722653985685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document falcon +0.00035057186899911295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document falcon +0.0003507566548933051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document falcon +0.00035630360179023747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document falcon +0.00035631362503416367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document falcon +0.0003490204248026821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document falcon +0.00035761724058371226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document falcon +0.00035037664777467137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document falcon +0.000353402110481068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document falcon +0.00034524163568371745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document falcon +0.00035528523728570974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document falcon +0.00034784916132431703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document falcon +0.00034928476408048925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document falcon +0.00034989205973784984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document falcon +0.00034201664404094254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document falcon +0.0003529676016338611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document falcon +0.00034643433682346637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document falcon +0.0003511666373001904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document falcon +0.00034828669066575333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document falcon +0.0003494625207264413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document falcon +0.0003458957535879216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document falcon +0.0003543020478990003 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document falcon +0.00034754384069014956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document falcon +0.0003598856392240133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document falcon +0.0003503335458553846 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document falcon +0.00035919595619778716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document falcon +0.00035767737970754404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document falcon +0.00035197152783998165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document falcon +0.0003549609834422404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document falcon +0.0003568184100569753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document falcon +0.0003512652818651935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document falcon +0.00035912648958665754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document falcon +0.00034764526964056546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document falcon +0.000352439784960359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document falcon +0.00035295886560764226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document falcon +0.0003518132693658672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document falcon +0.00035589987915465713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document falcon +0.00034923863317385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document falcon +0.0003457987267929692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document falcon +0.0003560928663480501 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document falcon +0.0003529603811204932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document falcon +0.0003524438555443043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document falcon +0.0003438847030263783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document falcon +0.00035981978898461613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document falcon +0.0003446342778566972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document falcon +0.00035529584995236537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document falcon +0.00034855740895831116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document falcon +0.00034932634912802544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document falcon +0.00035805518303064666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document falcon +0.0003497941877073061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document falcon +0.00035774398685405447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document falcon +0.0003560421780316607 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document falcon +0.0003508844468369392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document falcon +0.00035731928892270107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document falcon +0.0003557884626314314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document falcon +0.00034992996760289355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document falcon +0.000360752554360921 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document falcon +0.0003452321668708545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document falcon +0.0003591745226131023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document falcon +0.00035256981433229084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document falcon +0.00035378123159712034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document falcon +0.000350464354895999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document falcon +0.00035074625557389677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document falcon +0.00035025894701994667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document falcon +0.00035437902514857614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document falcon +0.0003514684519732232 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document falcon +0.00035449717909633905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document falcon +0.0003436816402714221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document falcon +0.00035139158071782116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document falcon +0.0003509424079843335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document falcon +0.000343894618577506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document falcon +0.0003500789770661659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document falcon +0.0003407788080680086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document falcon +0.0003581908175239701 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document falcon +0.0003465541618780918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document falcon +0.00034600228792437736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document falcon +0.00034416738982773204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document falcon +0.0003519900340150641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document falcon +0.000343369616864659 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document falcon +0.0003544993883274688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document falcon +0.0003504441365073392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document falcon +0.00034859160702727056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document falcon +0.00035355909532647185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document falcon +0.0003471900922691849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document falcon +0.0003563015508709187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document falcon +0.0003487888744148821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document falcon +0.00034711767548688336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document falcon +0.0003530734609369085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document falcon +0.00035123969242560935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document falcon +0.0003517127620891489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document falcon +0.00035232835416868673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document falcon +0.0003524437481912308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document falcon +0.0003525996167005602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document falcon +0.00035064770545242043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document falcon +0.00035311558274981226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document falcon +0.00034952204800569914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document falcon +0.0003541471367344846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document falcon +0.00035418812454561825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document falcon +0.0003528951372900714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document falcon +0.0003542338042975688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document falcon +0.00034937738939942796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document falcon +0.0003522182190878447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document falcon +0.0003501406466507449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document falcon +0.00034973079877492633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document falcon +0.0003485274567713538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document falcon +0.00034999308679368985 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document falcon +0.0003570051724707296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document falcon +0.00034567230462019706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document falcon +0.00035529000940160696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document falcon +0.00034956512308671755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document falcon +0.0003496962834028953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document falcon +0.0003468745282493457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document falcon +0.0003502717155809202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document falcon +0.0003556240880896514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document falcon +0.0003515109488424343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document falcon +0.0003563156688192592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document falcon +0.00035040277363989817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document falcon +0.0003481408593290717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document falcon +0.0003624575124332874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document falcon +0.0003522684124250313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document falcon +0.00035286996027653544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document falcon +0.00034967623997256725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document falcon +0.00035182649587602765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document falcon +0.0003524892557026489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document falcon +0.0003507642477451811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document falcon +0.00036190408389835666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document falcon +0.00035102739424880766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document falcon +0.00035239718753257265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document falcon +0.00035298076121821316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document falcon +0.0003478704389752654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document falcon +0.0003503109191567942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document falcon +0.00035143250975654426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document falcon +0.0003480663923069012 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document falcon +0.00035691540219998623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document falcon +0.000348815437166351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document falcon +0.00035202073257766225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document falcon +0.0003491569096274706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document falcon +0.00035277390475511834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document falcon +0.0003524972090026609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document falcon +0.0003504854249750236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document falcon +0.00034740238025423914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document falcon +0.00034968015462277606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document falcon +0.0003493798632762674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document falcon +0.0003488202537862122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document falcon +0.0003525461864643725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document falcon +0.00034903815232825664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document falcon +0.00035536982539258216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document falcon +0.00034858083265155483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document falcon +0.0003505014973608067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document falcon +0.00035327984042622104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document falcon +0.0003503286677453136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document falcon +0.00035835274842442816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document falcon +0.00034970302660275595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document falcon +0.000357929573140149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document falcon +0.0003517238649788585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document falcon +0.00036097027318848475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document falcon +0.0003502734074110026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document falcon +0.00035801510806036273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document falcon +0.0003568006373479869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document falcon +0.00036128108717454636 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document falcon +0.0003563436883111686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document falcon +0.00035559725321852463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document falcon +0.00035089656006854944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document falcon +0.000359453964362057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document falcon +0.00035629498059104033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document falcon +0.0003622207707090437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document falcon +0.0003540946784512821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document falcon +0.0003594750565232011 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document falcon +0.0003566007415086991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document falcon +0.0003562142599126134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document falcon +0.0003569948186744601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document falcon +0.00035166554847920186 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document falcon +0.00035047994419295137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document falcon +0.0003561578193739437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document falcon +0.00035470866838811544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document falcon +0.00034216920464876335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document falcon +0.0003550021513075795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document falcon +0.0003488045105938729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document falcon +0.0003513340720840151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document falcon +0.0003448558566387584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document falcon +0.0003460966026953241 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document falcon +0.0003488157616036459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document falcon +0.0003446120387842362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document falcon +0.000351528602987427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document falcon +0.00035661118227454713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document falcon +0.0003551342699877457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document falcon +0.0003478953397924445 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document falcon +0.00034625782458988215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document falcon +0.0003527515447405871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document falcon +0.00034823744889805696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document falcon +0.00034823314560254406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document falcon +0.00035162668292961944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document falcon +0.0003477307716074623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document falcon +0.0003446457989477787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document falcon +0.00034782916273767795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document falcon +0.0003517249130302248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document falcon +0.0003449873430908556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document falcon +0.00034841291749669877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document falcon +0.0003466028498941749 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document falcon +0.0003486436831199424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document falcon +0.0003478279234211838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document falcon +0.0003495903653274374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document falcon +0.00034896893881218957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document falcon +0.000348941645312426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document falcon +0.0003474221308416894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document falcon +0.0003462621543839385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document falcon +0.0003669373860863891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document falcon +0.00034691156268163006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document falcon +0.0003527774103765281 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document falcon +0.00034684565672734663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document falcon +0.0003454250599604457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document falcon +0.0003541536557159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document falcon +0.000345735737037366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document falcon +0.0003524669816385214 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document falcon +0.0003441817133096468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document falcon +0.0003519093265859089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document falcon +0.00035080085480352095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document falcon +0.00035285227929327434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document falcon +0.00034354836346901676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document falcon +0.00034789770937373467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document falcon +0.000343665920520102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document falcon +0.0003490884931060568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document falcon +0.00034380029463398654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document falcon +0.00034874768005099945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document falcon +0.0003457058510967673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document falcon +0.00034644265227023904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document falcon +0.00035008339858594957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document falcon +0.0003462377193296194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document falcon +0.0003620491787114201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document falcon +0.000348717011044469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document falcon +0.00034370072363913706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document falcon +0.0003551981066775649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document falcon +0.0003500119496799342 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document falcon +0.0003485082952669081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document falcon +0.0003508155580978919 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document falcon +0.00035311375163251416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document falcon +0.00034945972003423253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document falcon +0.0003474220353789879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document falcon +0.0003536443686585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document falcon +0.0003560350489042953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document falcon +0.0003493655927914396 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document falcon +0.0003528423977146383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document falcon +0.00035255554724471217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document falcon +0.0003479760010190111 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document falcon +0.00035458598862501956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document falcon +0.0003458990560538315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document falcon +0.00035157946422379875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document falcon +0.00034736860650169996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document falcon +0.0003529152313394119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document falcon +0.00034586294329524465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document falcon +0.00035707214923794877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document falcon +0.0003509580363496512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document falcon +0.00035244176725524474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document falcon +0.0003467539557999047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document falcon +0.00034919687962275546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document falcon +0.00035094031731719953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document falcon +0.0003484309008351352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document falcon +0.0003485409424916253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document falcon +0.0003499590776117838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document falcon +0.0003492842758957848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document falcon +0.0003529712275178912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document falcon +0.0003566141287087449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document falcon +0.0003649496522047409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document falcon +0.0003563218912208234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document falcon +0.00035614782126966145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document falcon +0.0003531944298453266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document falcon +0.0003535950949566616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document falcon +0.0003544295554928795 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document falcon +0.0003519908503740376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document falcon +0.00035752817626134463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document falcon +0.0003515322689589972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document falcon +0.0003486893890307115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document falcon +0.0003446520464889867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document falcon +0.0003509421562481707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document falcon +0.00035335015702909084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document falcon +0.0003490178167345008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document falcon +0.0003520497821155174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document falcon +0.0003549762618908944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document falcon +0.00035072190850833103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document falcon +0.0003542458638526423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document falcon +0.000352419194572916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document falcon +0.0003545102564672614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document falcon +0.0003495437992331806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document falcon +0.0003542843376993964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document falcon +0.000352827529313958 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document falcon +0.00035442506093223886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document falcon +0.0003496970719044257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document falcon +0.0003553096424442362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document falcon +0.00034986845565067564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document falcon +0.000352131055186658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document falcon +0.0003527021708198983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document falcon +0.00034905885414547214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document falcon +0.0003583433842468394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document falcon +0.00034409435202828383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document falcon +0.00034846410520871483 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document falcon +0.0003554459991927314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document falcon +0.00035310507471843076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document falcon +0.000350028910786098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document falcon +0.00035049727458009896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document falcon +0.0003519047735925826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document falcon +0.0003513027429919726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document falcon +0.0003626947260354396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document falcon +0.0003500087324849783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document falcon +0.0003618315726725285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document falcon +0.0003535385113938023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document falcon +0.0003487064058517615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document falcon +0.0003618709124780938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document falcon +0.00035040070335625915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document falcon +0.0003506279032267829 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document falcon +0.0003498435310527524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document falcon +0.0003554634749821431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document falcon +0.00035091209738758963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document falcon +0.00035034103678978573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document falcon +0.00035398931854386146 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document falcon +0.00035495529304989485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document falcon +0.00036067883473356603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document falcon +
diff --git a/ALCF/data-lists/sunspot/megawiki.txt b/ALCF/data-lists/sunspot/megawiki.txt
new file mode 100644
index 0000000000..f7fbabc913
--- /dev/null
+++ b/ALCF/data-lists/sunspot/megawiki.txt
@@ -0,0 +1,262 @@
+6.322825248625475e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document megawika +2.4432314037946264e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document megawika +5.6313888721313454e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document megawika +2.4208171781595055e-05
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document megawika +2.325811856369237e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document megawika +2.4010790356322705e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document megawika +5.36773610843632e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document megawika +1.360574433501002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document megawika +1.3076540344853244e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document megawika +1.3386534334886313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document megawika +1.2498103719605153e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document megawika +1.403763836949682e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document megawika +1.3636756723495417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document megawika +1.2242489446940814e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document megawika +1.2398255818973339e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document megawika +1.2972616994216281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document megawika +1.3947809855914134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document megawika +1.3144843787829514e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document megawika +1.1693809976572487e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document megawika +1.3677252682893802e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document megawika +1.3940876719849597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document megawika +1.4222245138730965e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document megawika +1.3201677767919704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document megawika +1.1421717796486169e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document megawika +1.2890514724498703e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document megawika +1.3649507648749037e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document megawika +1.2400732563490717e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document megawika +1.1557681453277616e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document megawika +1.2294483595964517e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document megawika +1.2137484472122283e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document megawika +1.3299663426456e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document megawika +1.2461984216479532e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document megawika +1.4666434217609636e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document megawika +1.1876997894686238e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document megawika +1.2939155338964078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document megawika +1.3859590039728515e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document megawika +1.317917848615668e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document megawika +1.1335281536110342e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document megawika +1.2889923952861426e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document megawika +1.3471671647053326e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document megawika +1.2221720014475102e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document megawika +1.2632647276287541e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document megawika +1.28276219004076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document megawika +1.36213704321643e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document megawika +1.2414858625261553e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document megawika +1.3173700421883744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document megawika +1.295597796725686e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document megawika +1.242783936442904e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document megawika +1.2417374088427464e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document megawika +1.2134479405400744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document megawika +1.3090040663304255e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document megawika +1.2713470581614905e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document megawika +5.5750231378906594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document megawika +5.777597358425469e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document megawika +5.349786767471258e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document megawika +5.675165050453583e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document megawika +5.482611216158831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document megawika +5.065421899890121e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document megawika +5.384718357480146e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document megawika +4.872037363236061e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document megawika +4.532709250783155e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document megawika +5.7257963030489613e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document megawika +4.9014365579652036e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document megawika +5.722863552770969e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document megawika +6.149911636146833e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document megawika +5.2178057608273506e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document megawika +4.990228161160431e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document megawika +5.866186875255134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document megawika +5.004185734360719e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document megawika +4.79401853705107e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document megawika +5.435219965052376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document megawika +5.035997225792266e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document megawika +5.622401774211625e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document megawika +5.028826157387559e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document megawika +5.596379470128795e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document megawika +6.027824493191489e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document megawika +5.5358270009931474e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document megawika +5.9839051807685496e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document megawika +5.1221077499249595e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document megawika +5.517228560620279e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document megawika +5.1687858285052305e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document megawika +5.684188244145645e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document megawika +5.212693275535878e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document megawika +4.8551007022784084e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document megawika +5.4888506639203145e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document megawika +5.345098688527242e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document megawika +4.8506420625516594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document megawika +5.132168603397676e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document megawika +5.719476795114223e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document megawika +5.7448621149792696e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document megawika +4.9068410568059265e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document megawika +5.382937299647678e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document megawika +4.8288432136304634e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document megawika +5.841703200305416e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document megawika +5.1589611587885584e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document megawika +6.031113829732574e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document megawika +5.4558202844532094e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document megawika +5.341852317196142e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document megawika +5.1402942738369954e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document megawika +5.735421384377395e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document megawika +5.473629863586958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document megawika +5.4708993245733936e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document megawika +4.931161863634078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document megawika +5.104173022127248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document megawika +5.510157161510824e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document megawika +5.652501401782597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document megawika +5.7273656573031666e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document megawika +5.638363224821738e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document megawika +5.6128115396668704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document megawika +5.00304877998141e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document megawika +5.596120554779096e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document megawika +5.5280923889040006e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document megawika +5.223477917938408e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document megawika +5.29472809986569e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document megawika +2.205682378243213e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document megawika +1.4367563720603185e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document megawika +3.5506193487931076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document megawika +3.0442910855821778e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document megawika +2.2540042508019627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document megawika +2.6880163202623216e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document megawika +2.534473148048727e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document megawika +2.6560945431318916e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document megawika +2.547470248967691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document megawika +2.5248825388073738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document megawika +2.5828729575000054e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document megawika +2.4026583817957736e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document megawika +2.3930425429834413e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document megawika +2.5037365362599724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document megawika +2.6696745470595603e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document megawika +2.140323051341762e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document megawika +2.617354786691592e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document megawika +1.538359101762691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document megawika +1.2871029252377856e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document megawika +2.255195411289217e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document megawika +2.4832313897952067e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document megawika +9.303873918189968e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document megawika +2.179532302620228e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document megawika +1.9750517506901206e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document megawika +2.7740420380648435e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document megawika +2.7813714782319335e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document megawika +4.1595357937609806e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document megawika +2.741365122389175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document megawika +2.117451071361901e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document megawika +1.7132649760565998e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document megawika +1.7492547092602047e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document megawika +1.7499951097392276e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document megawika +1.6632444789170958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document megawika +1.6678802252361607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document megawika +1.5519208704558896e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document megawika +1.652420992967167e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document megawika +1.6119931034508755e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document megawika +1.6638882076736552e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document megawika +1.7198076782652946e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document megawika +1.572927860565175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document megawika +1.5194822618169918e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document megawika +1.6677776832669846e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document megawika +1.595612492245688e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document megawika +1.682350633181197e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document megawika +1.663983380609724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document megawika +1.710187842689243e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document megawika +1.5733697527539038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document megawika +1.6972104757911438e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document megawika +1.6610142847616577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document megawika +1.61094882403031e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document megawika +1.4789207305138325e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document megawika +1.639299617676302e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document megawika +1.3241204512116132e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document megawika +8.582260726625535e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document megawika +8.213000975576739e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document megawika +9.549247732811947e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document megawika +9.17242785339013e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document megawika +7.632868223725218e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document megawika +8.674401118222175e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document megawika +9.124384255505347e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document megawika +8.344222222417358e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document megawika +8.992299957499065e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document megawika +8.76689497361025e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document megawika +7.973396239586015e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document megawika +9.006935606644125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document megawika +8.725545954955498e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document megawika +1.215449694669174e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document megawika +3.3041720284158646e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document megawika +2.0593512412624502e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document megawika +1.893608946986248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document megawika +1.737111666788535e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document megawika +1.4915923449873955e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document megawika +2.289370239067605e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document megawika +2.8615335689614638e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document megawika +8.847283630883125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document megawika +1.8175470362373804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document megawika +1.8152226683368038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document megawika +1.789149655314284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document megawika +1.7690523036477663e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document megawika +1.8333732213753644e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document megawika +1.8794105687718654e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document megawika +1.721841156706417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document megawika +2.0612008685724796e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document megawika +1.9297370681336376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document megawika +2.0188440409661018e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document megawika +5.1741216329695265e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document megawika +1.3417913926038429e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document megawika +1.1010813016469651e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document megawika +1.1252416134320087e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document megawika +1.2801744104313002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document megawika +1.3041514955795817e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document megawika +1.3428837580879075e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document megawika +1.320809382267804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document megawika +1.3451566676555968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document megawika +1.228284926657501e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document megawika +1.2410599573923043e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document megawika +1.3815343367377182e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document megawika +1.3895126265148832e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document megawika +1.2306773644401741e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document megawika +1.32981021906281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document megawika +1.101337469221607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document megawika +1.513094184404692e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document megawika +1.1073759547073234e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document megawika +1.2879348765857567e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document megawika +9.619595770228435e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document megawika +1.2384340836286436e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document megawika +1.1766667232211577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document megawika +1.2871049236196452e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document megawika +1.2010645926497744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document megawika +1.3971428231518597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document megawika +1.2283733550547932e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document megawika +1.2659530508255308e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document megawika +1.551775613074462e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document megawika +1.1169413343776979e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document megawika +1.1433700593712463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document megawika +4.964773647323492e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document megawika +1.0995586595687313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document megawika +1.2957393071411267e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document megawika +2.75899247407709e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document megawika +2.8269344597344854e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document megawika +2.329108187246831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document megawika +2.4231761430460284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document megawika +1.2434140512230442e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document megawika +1.638718338352859e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document megawika +3.272953556801187e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document megawika +6.061314500486327e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document megawika +1.2465979731210292e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document megawika +1.2737557327967737e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document megawika +1.038428658075627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document megawika +2.61666472045566e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document megawika +3.6506873212272224e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document megawika +1.5066359138295701e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document megawika +1.1166290872121178e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document megawika +1.5546966228590285e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document megawika +1.2583434625014828e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document megawika +1.3398826881300862e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document megawika +1.2944933160515968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document megawika +1.0971437399901365e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document megawika +1.2787922795775774e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document megawika +1.404979227816985e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document megawika +1.3344734431324463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document megawika +4.886031157107555e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document megawika +3.277261443596394e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document megawika +3.5057957685786495e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document megawika +3.287625301718589e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document megawika +3.1370056372668855e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document megawika +3.186092015785841e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document megawika +7.271819324142512e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document megawika diff --git a/ALCF/data-lists/sunspot/open-web-math-train.txt b/ALCF/data-lists/sunspot/open-web-math-train.txt new file mode 100644 index 0000000000..ffa745cd76 --- /dev/null +++ 
b/ALCF/data-lists/sunspot/open-web-math-train.txt @@ -0,0 +1,13 @@ +0.001451215788905126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document open-web-math-train +0.0014486847196258788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document open-web-math-train +0.0008861032722895899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document open-web-math-train +0.0018119590809459816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document open-web-math-train +0.0008916937917547129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document open-web-math-train +6.960128832809415e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document open-web-math-train +0.002008403651063623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document open-web-math-train +0.0014374900742131454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document open-web-math-train +0.00180213596996716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document open-web-math-train +0.001956178877532413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document open-web-math-train +0.0008829547017667033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document open-web-math-train +0.0008910853619157279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document open-web-math-train +0.0018260998845299973 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document open-web-math-train diff --git a/ALCF/data-lists/sunspot/pes2o.txt b/ALCF/data-lists/sunspot/pes2o.txt new file mode 100644 index 0000000000..a2a1209d8c --- /dev/null +++ b/ALCF/data-lists/sunspot/pes2o.txt @@ -0,0 +1,26 @@ +0.0012499632072059553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document pes2o +0.00125398260359913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document pes2o +0.0012541704774729071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document pes2o +0.0012527268234360602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document pes2o +0.0012532925243737164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document pes2o +0.0012456396241204315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document pes2o +0.0012589894424352072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document pes2o +0.001508020123999618 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document pes2o +0.00333096950781965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document pes2o +0.0033233414614415547 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document pes2o +0.003512387990689828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document pes2o +0.0035091382940513126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document pes2o +0.003514155927147005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document pes2o +0.003327108000579638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document pes2o +0.003329106196589836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document pes2o +0.003505604148738077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document pes2o +0.003324825759567855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document pes2o +0.0033248240149804913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document pes2o +0.0033385962112851358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document pes2o +0.0035043186296553615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document pes2o +0.003340469505431529 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document pes2o +0.0035106889084796276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document pes2o +0.0033309469281030167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document pes2o +0.003340337858029757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document pes2o +0.003505919861097801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document pes2o +0.0003882924098240512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document pes2o diff --git a/ALCF/data-lists/sunspot/reddit.txt b/ALCF/data-lists/sunspot/reddit.txt new file mode 100644 index 0000000000..a1de492a2f --- /dev/null +++ b/ALCF/data-lists/sunspot/reddit.txt @@ -0,0 +1,78 @@ +0.0005759963691850877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document reddit +0.0005959971675332674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document reddit +0.0006026179290353799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document reddit +0.0005824184320784846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document reddit +0.0005854598548616037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document reddit +0.0005903767055633473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document reddit +0.0005930306490982049 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document reddit +0.000569425602700746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document reddit +0.0005675060415179408 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document reddit 
+0.0005772431621253389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document reddit +0.0005678026053826858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document reddit +0.0005700398263483378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document reddit +0.0005669467963528824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document reddit +0.0005701015953324305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document reddit +0.0005795907287413296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document reddit +0.0005735602737531164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document reddit +0.0005749862745842101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document reddit +0.0005693257015931971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document reddit +0.0005716568794795563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document reddit +0.0005761083919774021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document reddit +0.0005688343169797355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document reddit +0.0005807913190929842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document reddit +0.0005710229258078636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document reddit +0.0005704083039826862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document reddit +0.0005862132348308056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document reddit +0.0005717662049559556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document reddit +0.0005858155213694451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document reddit +0.0005812012281792392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document reddit +0.0005803981414588498 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document reddit +0.0005700102108287723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document reddit +0.0005719243459052329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document reddit +0.0005867253401661752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document reddit +0.0005731087218860733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document reddit +0.0005712197789109317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document reddit +0.0005702376926310089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document reddit +0.0005700411527742972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document reddit 
+0.0005828090098178196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document reddit +0.0005770140826168056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document reddit +0.0005723509664597896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document reddit +0.0005755499231836962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document reddit +0.0005636407438471367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document reddit +0.0005640281556500104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document reddit +0.0005633159058766496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document reddit +0.0005638034311151449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document reddit +0.0005630066273073224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document reddit +0.0005631803831128559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document reddit +0.0005631228881679657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document reddit +0.0005628178701487633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document reddit +0.0005624448092256196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document reddit +0.0005620957024062329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document reddit +0.0005614201504177484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document reddit +0.0005616890951464056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document reddit +0.0005611348559279058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document reddit +0.0005604238061828518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document reddit +0.0005603301490194237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document reddit +0.0005607291294548833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document reddit +0.0005605234569930727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document reddit +0.0005613778566640694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document reddit +0.0005610248539992471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document reddit +0.0005599977416780475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document reddit +0.0005603632562116935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document reddit +0.0005599177479509897 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document reddit +0.0005595202318298379 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document reddit 
+0.0005600975633499175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document reddit +0.0005614075491213365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document reddit +0.000612563885043477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document reddit +0.0005515469909644413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document reddit +0.0005526782014946906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document reddit +0.0005472463408095445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document reddit +0.0005502284746004587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document reddit +0.0005414514790555363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document reddit +0.0005513499500134784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document reddit +0.0005391391454105187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document reddit +0.0005415836910001838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document reddit +0.0005208132468536551 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document reddit +0.0005889827143132871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document reddit +0.0005822520817765276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document reddit +0.0004173155230758696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document reddit diff --git a/ALCF/data-lists/sunspot/stack.txt b/ALCF/data-lists/sunspot/stack.txt new file mode 100644 index 0000000000..60cf4451ab --- /dev/null +++ b/ALCF/data-lists/sunspot/stack.txt @@ -0,0 +1,26 @@ +0.0009994361338078242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document stackexchange +0.001087156194657966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document stackexchange +0.0010667737163656816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document stackexchange +0.0009602877882124873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document stackexchange +0.0008968956271971105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document stackexchange +0.0009198034843762967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document stackexchange +0.0009423901016715341 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document stackexchange +0.0009674094553686345 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document stackexchange +0.0009858331322519164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document stackexchange +0.0009970593645879198 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document stackexchange +0.0010027035193731686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document stackexchange +0.0010128291154221853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document stackexchange +0.0010215631382631918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document stackexchange +0.0010288663771461238 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document stackexchange +0.0010346219929285867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document stackexchange +0.00104544019940344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document stackexchange +0.0010525172676724333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document stackexchange +0.0010609529620775127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document stackexchange +0.0010725892748610153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document stackexchange +0.0010818563598181568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document stackexchange +0.0010992760196793917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document stackexchange +0.0011178992762079917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document stackexchange +0.001124687532085676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document stackexchange +0.001118303661267191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document stackexchange +0.0010206825575416534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document stackexchange +0.0005512280117499715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document stackexchange diff --git a/ALCF/data-lists/sunspot/starcoder.txt b/ALCF/data-lists/sunspot/starcoder.txt new file mode 100644 index 0000000000..0011e33989 --- /dev/null +++ b/ALCF/data-lists/sunspot/starcoder.txt @@ -0,0 +1,50 @@ +0.004474659408857016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document starcoder +0.00409944473890653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document starcoder +0.005137179939941845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document starcoder +0.005143172251066109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document starcoder +0.005206134363352808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document starcoder +0.004892747858974329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document starcoder +0.004844731352552902 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document starcoder +0.005308320169123755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document starcoder +0.005124709815666577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document starcoder +0.005424710744483826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document starcoder +0.00538244648861977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document starcoder +0.0029107284679086853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document starcoder +0.0026825258998444705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document starcoder +0.0026904503191419243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document starcoder +0.002687906577174073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document starcoder +0.002850165346048818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document starcoder +0.005322698571717847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document starcoder +0.004450334290869719 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document starcoder +0.004700990083440683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document starcoder +0.003903568556500995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document starcoder +0.00390561515396931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document starcoder +0.0039046402900912262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document starcoder +0.003907454839379547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document starcoder +0.0038583224578603824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document starcoder +0.0037914116657695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document starcoder +0.003786665266798682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document starcoder +0.003792000802430658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document starcoder +0.00319266847466091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document starcoder +0.0032658716699838944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document starcoder +0.0034801959532460023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document starcoder +0.0028307012092022594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document starcoder +0.0028420360878146276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document starcoder +0.0028410455248484914 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document starcoder +0.00283497183526842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document starcoder +0.002840187195459487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document starcoder +0.0028398709431369834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document starcoder +0.004364722843422023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document starcoder +0.004093255713117101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document starcoder +0.004092331079566252 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document starcoder +0.004005326985579649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document starcoder +0.0036205502856964207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document starcoder +0.003625316793034984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document starcoder +0.003604743435602363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document starcoder +0.0035405823343673125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document starcoder +0.0041601413517253945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document starcoder +0.005886303658937057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document starcoder +0.003600909532810332 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document starcoder +0.0034941365817168658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document starcoder +0.0004992164842980224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document starcoder + diff --git a/ALCF/data-lists/sunspot/tulu.txt b/ALCF/data-lists/sunspot/tulu.txt new file mode 100644 index 0000000000..b2e1425784 --- /dev/null +++ b/ALCF/data-lists/sunspot/tulu.txt @@ -0,0 +1,66 @@ +0.00032927705604725614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document tulu +0.0002860154190878753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document tulu +0.0002845217585425619 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document tulu +0.0002743528685497456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document tulu +0.00026025323737738766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document tulu +0.00023493876414603155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document tulu +0.00029665994994226705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document tulu +0.00031808102075993956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document 
tulu +0.00031813573046011285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document tulu +0.0002711905171855542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document tulu +0.00028892513401817095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document tulu +0.00030003908676979083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document tulu +0.00026839878771944684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document tulu +0.00029155935002690497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document tulu +0.0002998624927624209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document tulu +0.0003091705447974841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document tulu +0.00026873195794309786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document tulu +0.00027721873498527547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document tulu +0.0002841662554024377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document tulu +0.0002839461156551537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document tulu +0.0002861705604659811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document tulu +0.0002460995649635886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document tulu +0.00019420142619795496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document tulu +0.00021967677816173628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document tulu +0.0002620283200480949 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document tulu +0.0002433390542188936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document tulu +0.00021254976608350767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document tulu +0.00022094815569522115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document tulu +0.000342862378668244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document tulu +0.00033784225259118157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document tulu +0.0003367278459543952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document tulu +0.00029843279042852765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document tulu +0.0002926583661257988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document tulu +0.00029320337282010673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document tulu +0.00029281450669483455 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document tulu +0.0002915338187002653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document tulu +0.0002864226923084572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document tulu +0.00028643439083586396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document tulu +0.00028253710956299054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document tulu +0.0002810856078805806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document tulu +0.00031474941344656715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document tulu +0.0002139130222205655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document tulu +0.0003084648871862831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document tulu +0.0003309477872140129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document tulu +0.0003360096824695161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document tulu +0.0003355452655196557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document tulu +0.00038119390366386037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document tulu +0.00038078927630086064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document tulu +0.0003386200917551554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document tulu +0.0002158905159938882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document tulu +0.00021621682877018768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document tulu +0.00021553306942740535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document tulu +0.00021581563462722296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document tulu +0.0002157694110556169 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document tulu +0.000215643699847159 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document tulu +0.00021532716715168094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document tulu +0.00021531221326022472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document tulu +0.0002831801179028896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document tulu +0.0002514844936507595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document tulu +0.00031638782778107964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document tulu +0.0002749197545278445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document tulu 
+0.00026159721512464495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document tulu +0.0002630052420096968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document tulu +0.00031106811228913666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document tulu +0.0002852973415334161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document tulu +3.7555372465932136e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document tulu diff --git a/ALCF/data-lists/sunspot/wiki.txt b/ALCF/data-lists/sunspot/wiki.txt new file mode 100644 index 0000000000..172bd0323b --- /dev/null +++ b/ALCF/data-lists/sunspot/wiki.txt @@ -0,0 +1,2 @@ +0.003548077173506675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document wiki +0.0018372203137874265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document wiki diff --git a/ALCF/ds_to_universal.py b/ALCF/ds_to_universal.py new file mode 100755 index 0000000000..da7fcb708e --- /dev/null +++ b/ALCF/ds_to_universal.py @@ -0,0 +1,693 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from functools import partial +from itertools import chain +import argparse +import glob +import itertools +import math +from concurrent.futures import ProcessPoolExecutor +import os +import re +import shutil +import torch +import tqdm +# from pprint import pprint + +from deepspeed.checkpoint import DeepSpeedCheckpoint +from deepspeed.checkpoint import ( + OPTIMIZER_STATE_DICT, + ZERO_STAGE, + BASE_OPTIMIZER_STATE, + SINGLE_PARTITION_OF_FP32_GROUPS, + PARAM_GROUPS, + PARAM_SLICE_MAPPINGS, + PARAM_SHAPES, + PARAM, + CAT_DIM, + PARAM_N_SUB_PARAMS, + SUB_PARAM_SHAPE, + VOCAB_TENSOR, + UNIVERSAL_CHECKPOINT_INFO, + UNIVERSAL_CHECKPOINT_VERSION_KEY, + UNIVERSAL_CHECKPOINT_VERSION_VALUE, + VOCABULARY_PARAMETER_PATTERNS, + PIPELINE_REPLICATED_PARAMETER_PATTERNS, + TP_REPLICATED_PARAMETER_PATTERNS, + PARAMETER_TO_AVERAGE_PATTERNS, + PARAMETER_WITH_ROW_PARALLELISM_PATTERNS, + PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0, + PARAMETER_WITH_SUB_PARAMS, + SubparamShape, +) + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_folder", + type=str, + required=True, + help="Input DeepSpeed Checkpoint folder", + ) + parser.add_argument( + "--output_folder", + type=str, + required=False, + default=None, + help="Output DeepSpeed checkpoint folder", + ) + parser.add_argument( + "--num_extract_workers", + default=4, + type=int, + help="How many parallel processes to extract zero shards", + ) + parser.add_argument( + "--num_merge_workers", + default=2, + type=int, + help="How many parallel processes to merge tp slices (more memory intensive, use much fewer than --num_extract_workers))", + ) + parser.add_argument( + "--keep_temp_folder", + action="store_true", + help="Preserve temporary folder of intermediate checkpoint slice files. 
Useful for debugging.", + ) + parser.add_argument( + "--no_strict", + dest="strict", + action="store_false", + help="Do not perform validity checks on converted checkpoint.", + ) + parser.add_argument( + "--inject_missing_state", + action="store_true", + help="Inject missing checkpoint state into the checkpoint if it is absent.", + ) + args = parser.parse_args() + print(f"args = {args}") + return args + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + """ + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + """ + return [atoi(c) for c in re.split(r"(\d+)", text)] + + +def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree): + path_list = [] + iter_folder = f"iter_{iteration:07d}" + for i in range(0, tp_degree): + path_list.append([]) + for j in range(0, pp_degree): + rank_folder = ( + f"mp_rank_{i:02d}" if pp_degree == 1 else f"mp_rank_{i:02d}_{j:03d}" + ) + ckpt_path = os.path.join(rank_folder, "model_optim_rng.pt") + path_list[i].append(os.path.join(base_folder, iter_folder, ckpt_path)) + + return path_list + + +def _save_checkpoint(file_path, chkpt_sd): + dir, _ = os.path.split(file_path) + os.makedirs(dir, exist_ok=True) + torch.save(chkpt_sd, file_path) + + +def extract_zero_shards(dir, ds_checkpoint, indices_3D): + pp_index, tp_index, dp_index = indices_3D + sd = ds_checkpoint.get_zero_checkpoint_state( + pp_index=pp_index, + tp_index=tp_index, + dp_index=dp_index, + strip_tensor_paddings=False, + ) + + # pprint(f"Processing {dp_index=} {pp_index=}, {tp_index=}") + + optim_sd = sd[OPTIMIZER_STATE_DICT] + param_slice_mappings = optim_sd[PARAM_SLICE_MAPPINGS] + universal_checkpoint_info = ds_checkpoint.get_checkpoint_info( + UNIVERSAL_CHECKPOINT_INFO + ) + pipeline_replicated_params = universal_checkpoint_info.get( + PIPELINE_REPLICATED_PARAMETER_PATTERNS, [] + ) + # print(f'{pipeline_replicated_params=}') + + # dict + state_groups = optim_sd[BASE_OPTIMIZER_STATE]["state"] + # list + fp32_groups = optim_sd[SINGLE_PARTITION_OF_FP32_GROUPS] + param_groups_cnt = len(state_groups) + + for param_group_id in range(param_groups_cnt): + flat_state = dict( + exp_avg=state_groups[param_group_id]["exp_avg"], + exp_avg_sq=state_groups[param_group_id]["exp_avg_sq"], + fp32=fp32_groups[param_group_id], + ) + + if "step" in state_groups[param_group_id]: + flat_state["step"] = state_groups[param_group_id]["step"] + + for name, fragment_mapping in param_slice_mappings[param_group_id].items(): + if pp_index > 0 and any( + re.match(pattern, name) for pattern in pipeline_replicated_params + ): + # Skip tied weights that are replicated in first and last pp stages + continue + + # pprint(f"dpt{dp_index}{pp_index}{tp_index} {param_group_id} {name} => {fragment_mapping.start}:{fragment_mapping.numel}") + for state_key in flat_state.keys(): + dump_param_fragment( + dir, + tp_index, + dp_index, + state_key, + flat_state[state_key], + name, + fragment_mapping.start, + fragment_mapping.numel, + ) + + +def extract_zero_shards_stage3( + optim_files, param_shapes, dp_degree, temp_dir, dp_index +): + state_dict = torch.load(optim_files[dp_index], map_location="cpu") + + flat_state = dict( + exp_avg=state_dict[OPTIMIZER_STATE_DICT]["optimizer_state_dict"]["state"][0][ + "exp_avg" + ], + exp_avg_sq=state_dict[OPTIMIZER_STATE_DICT]["optimizer_state_dict"]["state"][0][ + "exp_avg_sq" + ], + 
fp32=state_dict[OPTIMIZER_STATE_DICT]["fp32_flat_groups"][0], + ) + + offset = 0 + for name, shape in param_shapes.items(): + unpartitioned_numel = shape.numel() + partitioned_numel, _ = _zero_partitioned_param_info( + unpartitioned_numel, dp_degree + ) + padding_free_numel = min( + partitioned_numel, abs(unpartitioned_numel - dp_index * partitioned_numel) + ) + for state_key in flat_state.keys(): + dump_param_fragment( + temp_dir, + 0, + dp_index, + state_key, + flat_state[state_key], + name, + offset, + padding_free_numel, + ) + offset += partitioned_numel + + +cnt = 0 + + +def dp_index_to_str(dp_index): + return f"{dp_index:0>2d}" + + +def dump_param_fragment( + dir, tp_index, dp_index, state_name, state_flat_tensor, param_name, offset, numel +): + global cnt # temp hack + + param_base_path = os.path.join(dir, param_name, str(tp_index)) + os.makedirs(param_base_path, exist_ok=True) + + cnt += 1 + + path = os.path.join(param_base_path, f"{state_name}.{dp_index_to_str(dp_index)}") + + # print(f"{param_name}: {offset}: {numel} => {path}") + + # State might be a python int or a tensor + if state_name != "step" and torch.is_tensor(state_flat_tensor): + state_flat_tensor = state_flat_tensor.narrow(0, offset, numel).clone() + _save_checkpoint(path, state_flat_tensor) + + +def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape=None): + slices = [] + for tp_index in range(tp_degree): + prefix_path = os.path.join(param_base_path, str(tp_index), f"{state}") + paths = glob.glob(f"{prefix_path}.*") + + if len(paths) == 0: + continue + + pattern = re.compile(f"{prefix_path}\\.([0-9]+)") + dp_indices = set() + for p in paths: + m = pattern.match(p) + if m: + dp_indices.add(int(m.group(1))) + else: + raise ValueError(f"Cannot parse dp_rank from {p}") + + paths = [ + f"{prefix_path}.{dp_index_to_str(dp_index)}" + for dp_index in sorted(list(dp_indices)) + ] + shards = [torch.load(p) for p in paths] + + if state == "step": + assert all( + v == shards[0] for v in shards + ), "All shards must have the same step value" + slice = shards[0] + else: + if slice_shape is None: + slice = torch.cat(shards, dim=0) + else: + slice = torch.cat(shards, dim=0).reshape(slice_shape) + + slices.append(slice) + return slices + + +def merge_tp_slices(ds_checkpoint, dir, slice_dir, tp_degree, name_and_shape): + name, shape = name_and_shape + slice_base_path = os.path.join(slice_dir, name) + param_base_path = os.path.join(dir, name) + + universal_checkpoint_info = ds_checkpoint.get_checkpoint_info( + UNIVERSAL_CHECKPOINT_INFO + ) + replicated_parameters = universal_checkpoint_info.get( + TP_REPLICATED_PARAMETER_PATTERNS, [] + ) + parameters_to_average = universal_checkpoint_info.get( + PARAMETER_TO_AVERAGE_PATTERNS, [] + ) + parameters_with_row_parallelism = universal_checkpoint_info.get( + PARAMETER_WITH_ROW_PARALLELISM_PATTERNS, [] + ) + vocabulary_parameters = universal_checkpoint_info.get( + VOCABULARY_PARAMETER_PATTERNS, [] + ) + parameters_with_2_sub_params_cat_dim_0 = universal_checkpoint_info.get( + PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0, [] + ) + parameter_with_sub_params = universal_checkpoint_info.get( + PARAMETER_WITH_SUB_PARAMS, [] + ) + + unmatched_patterns = set( + replicated_parameters + + parameters_to_average + + parameters_with_row_parallelism + + vocabulary_parameters + + parameters_with_2_sub_params_cat_dim_0 + ) + unmatched_patterns.update( + chain.from_iterable( + SubparamShape(**s).patterns for s in parameter_with_sub_params + ) + ) + + def get_matched_pattern(patterns_, name_): + 
matched_ = [pattern_ for pattern_ in patterns_ if re.match(pattern_, name_)] + assert ( + len(matched_) <= 1 + ), f"Got more than one matching patterns={matched_} for {name_}" + if matched_: + pattern_ = matched_[0] + unmatched_patterns.discard(pattern_) + return pattern_ + return None + + def get_matched_sub_params_pattern(name_): + for subparam_shape_dict in parameter_with_sub_params: + subparam_shape = SubparamShape(**subparam_shape_dict) + for pattern_ in subparam_shape.patterns: + if re.match(pattern_, name_): + unmatched_patterns.discard(pattern_) + return subparam_shape + return None + + matched_sub_params_shape = get_matched_sub_params_pattern(name) + + step_merged = _merge_zero_shards(slice_base_path, "step", tp_degree, shape) + if step_merged: + _save_checkpoint(os.path.join(param_base_path, f"step.pt"), step_merged[0]) + + for state in ("fp32", "exp_avg", "exp_avg_sq"): + slices = _merge_zero_shards(slice_base_path, state, tp_degree, shape) + final_path = os.path.join(param_base_path, f"{state}.pt") + + # print(f"Expected shape: {shape}") + # print(f"Fragment sizes:", list(frag.shape for frag in slices)) + ckpt_dict = {} + if get_matched_pattern(replicated_parameters, name): + if len(slices) > 1: + assert all([slices[0].equal(other_slice) for other_slice in slices[1:]]) + param = slices[0] + # print(f'replicate {name} using first slice') + elif get_matched_pattern(parameters_to_average, name): + param = sum(slices) / len(slices) + # print(f'merge {name} using average') + elif get_matched_pattern(parameters_with_2_sub_params_cat_dim_0, name): + cat_dim = 0 + chunked_slices = [torch.chunk(s, 2, dim=cat_dim) for s in slices] + merged_chunks_0 = torch.cat([s[0] for s in chunked_slices], dim=cat_dim) + merged_chunks_1 = torch.cat([s[1] for s in chunked_slices], dim=cat_dim) + param = torch.cat([merged_chunks_0, merged_chunks_1], dim=cat_dim) + ckpt_dict[CAT_DIM] = cat_dim + ckpt_dict[PARAM_N_SUB_PARAMS] = 2 + elif matched_sub_params_shape: + merged_chunks = [] + partition_dim = matched_sub_params_shape.partition_dim + + sub_dim_sizes = matched_sub_params_shape.shape[partition_dim] + if not isinstance(sub_dim_sizes, tuple): + sub_dim_sizes = (sub_dim_sizes,) + + partition_shape = [ + sum(d) if isinstance(d, tuple) else d + for d in matched_sub_params_shape.shape + ] + partition_shape = [ + d // tp_degree if i == partition_dim else d + for i, d in enumerate(partition_shape) + ] + slices = [s.view(partition_shape) for s in slices] + + offset = 0 + for sub_dim_size in sub_dim_sizes: + part_sub_dim_size = sub_dim_size // tp_degree + merged_chunks.append( + torch.cat( + [ + s.narrow(partition_dim, offset, part_sub_dim_size) + for s in slices + ], + dim=partition_dim, + ) + ) + offset += part_sub_dim_size + param = torch.cat(merged_chunks, dim=partition_dim) + ckpt_dict[SUB_PARAM_SHAPE] = matched_sub_params_shape + else: + cat_dim = ( + 1 if get_matched_pattern(parameters_with_row_parallelism, name) else 0 + ) + # print(f"merge {name} with CAT DIM: {cat_dim}") + param = torch.cat(slices, dim=cat_dim) + ckpt_dict[CAT_DIM] = cat_dim + + if get_matched_pattern(vocabulary_parameters, name): + # print(f"Before {param.shape=}") + # strip padding + original_vocab_size = universal_checkpoint_info["original_vocab_size"] + param = param[:original_vocab_size, :] + ckpt_dict[VOCAB_TENSOR] = True + # print(f"After {param.shape=}") + + # print(f"Final shape: {param.shape}") + ckpt_dict[PARAM] = param + _save_checkpoint(final_path, ckpt_dict) + + return unmatched_patterns + + +def 
merge_zero3_slices(dp_degree, dir, slice_dir, name): + slice_base_path = os.path.join(slice_dir, name) + param_base_path = os.path.join(dir, name) + + for state in ("fp32", "exp_avg", "exp_avg_sq"): + slices = _merge_zero_shards(slice_base_path, state, 1) + final_path = os.path.join(param_base_path, f"{state}.pt") + _save_checkpoint(final_path, slices[0]) + + +def _do_parallel_work(do_work, work_chunks, num_workers): + results = [] + if num_workers > 1: + with ProcessPoolExecutor(max_workers=num_workers) as executor: + future_list = [executor.submit(do_work, work) for work in work_chunks] + for f in tqdm.tqdm(future_list): + results.append(f.result()) + else: + # No parallel pass for unit testing + # We can't create child processes in tests + for work in tqdm.tqdm(work_chunks): + results.append(do_work(work)) + return results + + +def _extract_zero_shard_files(args, ds_checkpoint, temp_dir): + _3d_range_list = list( + itertools.product( + range(ds_checkpoint.pp_degree), + range(ds_checkpoint.tp_degree), + range(ds_checkpoint.dp_degree), + ) + ) + # pprint(f'{_3d_range_list=}') + + do_work = partial(extract_zero_shards, temp_dir, ds_checkpoint) + _do_parallel_work(do_work, _3d_range_list, args.num_extract_workers) + + +def _extract_zero_shard_files_stage3( + args, optim_files, param_shapes, dp_degree, temp_dir +): + do_work = partial( + extract_zero_shards_stage3, optim_files, param_shapes, dp_degree, temp_dir + ) + _do_parallel_work(do_work, list(range(dp_degree)), args.num_extract_workers) + + +def _merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir): + zero_output_folder = os.path.join(args.output_folder, "zero") + do_work = partial( + merge_tp_slices, + ds_checkpoint, + zero_output_folder, + temp_dir, + ds_checkpoint.tp_degree, + ) + unmatched_patterns_lists = _do_parallel_work( + do_work, list(slice_shapes.items()), args.num_merge_workers + ) + + # verify that all patterns were used + # if a pattern was not used by any of the workers, then it was not used at all -> assert/alert + sets = [set(lst) for lst in unmatched_patterns_lists] + unmatched_patterns = list(set.intersection(*sets)) + if args.strict: + assert ( + not unmatched_patterns + ), f"Unused patterns={unmatched_patterns} while merging tp slices" + elif unmatched_patterns: + print(f"Warning: Unused patterns={unmatched_patterns} while merging tp slices") + + +def _merge_zero3_slice_files(args, param_shapes, dp_degree, temp_dir): + zero_output_folder = os.path.join(args.output_folder, "zero") + do_work = partial(merge_zero3_slices, dp_degree, zero_output_folder, temp_dir) + _do_parallel_work(do_work, param_shapes.keys(), args.num_merge_workers) + + +def _zero_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _parse_model_states_stage3(files): + return torch.load(files[0], map_location=torch.device("cpu"))[PARAM_SHAPES] + + +def _save_optimizer_state(args, ds_checkpoint): + sharded_states = [ + BASE_OPTIMIZER_STATE, + PARAM_SLICE_MAPPINGS, + SINGLE_PARTITION_OF_FP32_GROUPS, + ] + sd = ds_checkpoint.get_zero_checkpoint_state( + pp_index=0, tp_index=0, dp_index=0, strip_tensor_paddings=False + ) + + optim_sd = sd[OPTIMIZER_STATE_DICT] + output_sd = {k: v for k, v in optim_sd.items() if k not in sharded_states} + output_sd[PARAM_GROUPS] = optim_sd[BASE_OPTIMIZER_STATE][PARAM_GROUPS] + 
zero_output_folder = os.path.join(args.output_folder, "zero") + output_file_path = os.path.join(zero_output_folder, f"optimizer_state.pt") + _save_checkpoint(output_file_path, output_sd) + + +def _save_optimizer_state_stage3(args, optim_files): + sd = torch.load(optim_files[0], map_location=torch.device("cpu")) + output_sd = sd[OPTIMIZER_STATE_DICT] + output_sd[PARAM_GROUPS] = output_sd[OPTIMIZER_STATE_DICT][PARAM_GROUPS] + zero_output_folder = os.path.join(args.output_folder, "zero") + output_file_path = os.path.join(zero_output_folder, f"optimizer_state.pt") + _save_checkpoint(output_file_path, output_sd) + + +def _get_optim_files(checkpoint_dir): + return _get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def _get_model_state_files(checkpoint_dir): + return _get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def _get_checkpoint_files(checkpoint_dir, glob_pattern): + ckpt_files = sorted( + glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys + ) + + if len(ckpt_files) == 0: + raise FileNotFoundError( + f"can't find {glob_pattern} files in directory '{checkpoint_dir}'" + ) + + return ckpt_files + + +def _get_zero_stage(optim_files): + state_dict = torch.load(optim_files[0], map_location=torch.device("cpu")) + optimizer_state = state_dict[OPTIMIZER_STATE_DICT] + zero_stage = optimizer_state.get(ZERO_STAGE, 1) + return zero_stage + + +def _inject_missing_state(ds_checkpoint): + if UNIVERSAL_CHECKPOINT_INFO not in ds_checkpoint.global_state: + sd = torch.load( + ds_checkpoint.mp_rank_files[0], map_location=torch.device("cpu") + ) + if UNIVERSAL_CHECKPOINT_INFO not in sd: + ds_checkpoint.global_state[UNIVERSAL_CHECKPOINT_INFO] = {} + ds_checkpoint.global_state[UNIVERSAL_CHECKPOINT_INFO][ + UNIVERSAL_CHECKPOINT_VERSION_KEY + ] = UNIVERSAL_CHECKPOINT_VERSION_VALUE + + +def _check_for_required_state(ds_checkpoint): + universal_checkpoint_info = ds_checkpoint.get_checkpoint_info( + UNIVERSAL_CHECKPOINT_INFO + ) + assert ( + universal_checkpoint_info is not None + ), f"Required {UNIVERSAL_CHECKPOINT_INFO} state is missing in checkpoint. Verify that client creates this state." + + +def main(args): + print(f"Convert DeepSpeed Checkpoint to Universal Checkpoint") + + print( + f"Converting DeepSpeed checkpoint in {args.input_folder} to Universal checkpoint in {args.output_folder}" + ) + + optim_files = _get_optim_files(args.input_folder) + zero_stage = _get_zero_stage(optim_files) + + if zero_stage <= 2: + ds_checkpoint = DeepSpeedCheckpoint(args.input_folder) + if args.inject_missing_state: + _inject_missing_state(ds_checkpoint) + else: + _check_for_required_state(ds_checkpoint) + + iteration = ds_checkpoint.get_iteration() + # _create_latest_file(args.output_folder, iteration) + checkpoint_paths = _create_checkpoint_paths( + args.output_folder, + iteration, + ds_checkpoint.tp_degree, + ds_checkpoint.pp_degree, + ) + + slice_shapes = [] + for mp_rank_file in ds_checkpoint.mp_rank_files: + mp_sd = torch.load(mp_rank_file, map_location=torch.device("cpu")) + slice_shapes += mp_sd[PARAM_SHAPES] + + # fix back to normal flat dict, merge duplicates for tp>1 + slice_shapes = dict((k, v) for d in slice_shapes for k, v in d.items()) + temp_dir = os.path.join(args.output_folder, "tmp") + + print("1. Extracting ZeRO fragments") + _extract_zero_shard_files(args, ds_checkpoint, temp_dir) + + print("2. Merging slices .....") + _merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir) + + print("3. 
Saving common optimizer states") + _save_optimizer_state(args, ds_checkpoint) + + if not args.keep_temp_folder: + shutil.rmtree(temp_dir, ignore_errors=True) + + # Copy mp* files into output folder + for f in glob.glob(os.path.join(args.input_folder, "mp*")): + shutil.copy2(f, args.output_folder) + + else: + model_files = _get_model_state_files(args.input_folder) + param_shapes = _parse_model_states_stage3(model_files) + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + dp_degree = len(model_files) + + temp_dir = os.path.join(args.output_folder, "tmp") + + print("*** 1. Extracting ZeRO fragments") + _extract_zero_shard_files_stage3( + args, optim_files, param_shapes, dp_degree, temp_dir + ) + + print("*** 2. Merging slices .....") + _merge_zero3_slice_files(args, param_shapes, dp_degree, temp_dir) + + print("*** 3. Saving common optimizer states") + _save_optimizer_state_stage3(args, optim_files) + + if not args.keep_temp_folder: + shutil.rmtree(temp_dir, ignore_errors=True) + + # Copy *model_states files into output folder + for f in glob.glob(os.path.join(args.input_folder, "*model_states.pt")): + shutil.copy2(f, args.output_folder) + + # Update latest to output folder + checkpoint_root_folder, step_folder = os.path.split(args.output_folder) + latest_file = os.path.join(checkpoint_root_folder, "latest_universal") + with open(latest_file, "w") as f: + f.write(step_folder) + + print("*** Done!") + + +if __name__ == "__main__": + args = parse_arguments() + main(args) diff --git a/ALCF/fused_stackcode.py b/ALCF/fused_stackcode.py new file mode 100644 index 0000000000..28c1e5c694 --- /dev/null +++ b/ALCF/fused_stackcode.py @@ -0,0 +1,36 @@ +import os +from os import system +import glob +import json +import gzip +import pdb + +def list_json_gz_files(directory): + # Create the search pattern for JSON.gz files + search_pattern = os.path.join(directory, "**/*.json.gz") + + # Use glob to find all files matching the pattern + json_gz_files = glob.glob(search_pattern, recursive=True) + + return json_gz_files + +def combine_json_gz_files(json_gz_files, output_file): + in_list = "" + for i in json_gz_files: + in_list = in_list + " " +str(i) + command = "cat" + in_list + " > " + output_file + print(command) + system(command) + print("done") + +directory_path = "./data/stack-code/" +folder_count = 0 +for folder in os.listdir(directory_path): + print(f"working for folder {folder} {os.path.join(directory_path, folder)}") + folder_count = folder_count + 1 + json_gz_files = list_json_gz_files(os.path.join(directory_path, folder)) + out_path = os.path.join("./fused_stack", folder) + os.makedirs(out_path, exist_ok=True) + output_file = os.path.join(out_path, 'fused.json.gz') + combine_json_gz_files(json_gz_files, output_file) + diff --git a/ALCF/fused_stackcode_bysize.py b/ALCF/fused_stackcode_bysize.py new file mode 100644 index 0000000000..d838369e6b --- /dev/null +++ b/ALCF/fused_stackcode_bysize.py @@ -0,0 +1,64 @@ +import os +from os import system +import glob +import json +import gzip +import pdb + +def list_json_gz_files(directory): + # Create the search pattern for JSON.gz files + search_pattern = os.path.join(directory, "**/*.json.gz") + + # Use glob to find all files matching the pattern + json_gz_files = glob.glob(search_pattern, recursive=True) + + return json_gz_files + +def combine_json_gz_files(json_gz_files, output_file): + in_list = "" + for i in json_gz_files: + in_list = in_list + " " +str(i) + command = "cat" + in_list + " > " + output_file + print(command) + 
system(command) + print("done ?") + +directory_path = "./fused_stack/" +out_path = "./fused_by_size" +folder_count = 0 +file_list = list_json_gz_files(directory_path) +size_dict = {} +for efile in file_list: + size_of_files = os.stat(efile) + size_dict[efile] = size_of_files.st_size / (1024*1024) # in MBs + +sorted_size_dict = dict(sorted(size_dict.items(), key=lambda item: item[1])) +vol = 0 +sublist = [] +super_list = {} +i=1 +for key, val in sorted_size_dict.items(): + if vol + val > 4608: + # add this item to list and reset vol, sublist + vol = 0 + sublist.append(key) + #print(sublist) + print("************") + super_list[i] = sublist + output_file = out_path + "/fused_stack_" + str(i) + ".json.gz" + print(output_file) + combine_json_gz_files(sublist, output_file) + sublist = [] + i=i+1 + else: + vol = vol + val + sublist.append(key) +#print(t) +#for folder in os.listdir(directory_path): +# print(f"working for folder {folder} {os.path.join(directory_path, folder)}") +# folder_count = folder_count + 1 +# json_gz_files = list_json_gz_files(os.path.join(directory_path, folder)) +# out_path = os.path.join("./fused_stack", folder) +# os.makedirs(out_path, exist_ok=True) +# output_file = os.path.join(out_path, 'fused.json.gz') +# combine_json_gz_files(json_gz_files, output_file) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh new file mode 100644 index 0000000000..252e641fe1 --- /dev/null +++ b/ALCF/helpers.sh @@ -0,0 +1,1486 @@ +#!/bin/bash --login +############################################################################### +# [`ALCF/helpers.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/helpers.sh) +# +# Contains helper functions for launching `../train_llama_alcf.sh` +# +# To use, on any of {Polaris, Aurora, Sunspot} @ ALCF: +# +# ```bash +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed +# $ export PBS_O_WORKDIR=$(pwd) && source ALCF/helpers.sh && setup +# ``` +# +# and this will, automatically: +# +# 1. Setup python (conda + virtual environment) +# +# 2. Parse `$PBS_*` env vars to build appropriate `alias launch='mpiexec ...'` +# command for launching across all GPUs in our active PBS job. +############################################################################### + +################## +# helpers_main +# +# This will get called automatically when running: +# +# ```bash +# $ cd Megatron-DeepSpeed +# $ PBS_O_WORKDIR=$(pwd) source ALCF/helpers.sh +# ``` +# +# - This will set `"${WORKING_DIR}"`, according to: +# +# 1. if `${PBS_O_WORKDIR}` is nonzero, use this +# 2. else, if `${SLURM_SUBMIT_DIR}` is nonzero use this +# 3. else, use `$(pwd)` +# +# this is _crucial_ since many of the functions below use paths +# which are defined relative to this "${WORKING_DIR}" +# (e.g. virtual environment, location of executables, etc.) +################## +helpers_main() { + # NOTE: for debug mode, run with `DEBUG=1` + if [[ -n "${DEBUG:-}" ]]; then + set -euxo + fi + if [[ -n "${PBS_O_WORKDIR}" ]]; then + WORKING_DIR="${PBS_O_WORKDIR}" + elif [[ -n "${SLURM_SUBMIT_DIR}" ]]; then + WORKING_DIR="${SLURM_SUBMIT_DIR}" + else + echo "Unable to detect PBS or SLURM working directory info..." + WORKING_DIR=$(python3 -c 'import os; print(os.getcwd())') + echo "Using ${WORKING_DIR} as working directory..." + fi + export WORKING_DIR="${WORKING_DIR}" + printf "Using WORKING_DIR: %s\n" "${WORKING_DIR}" +} + +############################################################################## +# setup +# +# All-in-one helper function. 
+# +# - Explicitly, this will: +# - Identify the machine we're on +# +# - Setup `python` +# 1. Load `conda` +# 2. Setup `venv` on top of `conda` +# +# - Ensure all dependencies are installed +# +# - Clone + Install [`saforem2/ezpz`](https://github.com/saforem2/ezpz) +# - Source [`ezpz/utils.sh`](https://github.com/saforem2/ezpz/blob/main/src/ezpz/bin/utils.sh) +# - This provides `{ezpz_setup_python, ezpz_setup_job}` (called below) +# +# - Set runtime options +# +# - Build `deepspeed_config.json` +# +# - Build {logs, checkpoints, etc} dirs, named according to specifics of +# current run +# +# - Specify additional `deepspeed` arguments +# +# - Ensure executable exists at expected path +# +# - Setup data + tokenizer via `TOKENIZER_TYPE` +# +# - Print job info +# +# - Save `.env` to `CKPT_DIR` for safe keeping +# +# - Check that we're not already running, and if so, exit. +# +# - Setup run command to be executed. +############################################################################## +setup() { + # Identify machine we're on + get_machine || exit + ########################################################################## + # ezpz_setup will: + # 1. Setup python + # - load base conda + # - (if necessary) create virtual environment on top of base conda + # - activate virtual environment from ^ + # 2. Install ezpz (if needed) + # 3. Parse PBS_* environment variables to determine: + # - NHOSTS (by counting number of lines in $PBS_NODEFILE) + # - NGPU_PER_HOST (by magic) + # - NGPUS (= NHOSTS * NGPU_PER_HOST) + # 4. Use these (^) to build our launch command + ezpz_setup || exit + ########################################################################## + install_dependencies + # Set command line arguments to pass to `"${EXEC}"` + setParams || exit + # Create `deepspeed_config.json` from runtime params from ^ + buildDSconfig || exit + # Specify output directory for {logs, checkpoints, etc.} + setup_checkpoint || exit + setOutput || exit + # Specify additional `deepspeed` arguments (dependent on _newly created_ variables) + set_args || exit + # Ensure executable exists in expected path + check_executable "${EXEC:-${WORKING_DIR}/pretrain_gpt_alcf.py}" + dfl="${DATA_FILE_LIST:-"${PBS_O_WORKDIR}/ALCF/data-lists/$(get_machine_name)/dolma.txt"}" + # Setup data + tokenizer via `DATA_FILE_LIST` and `TOKENIZER_TYPE` + tok="${TOKENIZER_TYPE:-Llama2Tokenizer}" + setup_tokenizer_and_data "${tok}" "${dfl}" || exit + make_data || exit + # Print job info + printJobInfo || exit + # Save `.env` to `CKPT_DIR` for safe keeping + save_dotenv "${CKPT_DIR}" || exit + # Check that were not already running, if so, exit. + check_and_kill_if_running || exit + # Setup run command to be executed + setup_run_cmd "$@" || exit +} + +##################################################### +# setup_run_cmd +# +# Build run command to be executed. 
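+#
+# The finished command is exported as the `run_cmd` array; a typical caller
+# (sketch only, not the exact contents of `train_llama_alcf.sh`) would then:
+#
+# ```bash
+# setup "$@"                                      # builds ${run_cmd[@]}
+# echo "${run_cmd[*]}" | tee -a "${OUTPUT_LOG}"   # record the launch command
+# eval "${run_cmd[*]}" |& tee -a "${OUTPUT_LOG}"  # run it, capturing output
+# ```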
+##################################################### +setup_run_cmd() { + ############################## + # take in additional arguments + # and append them directly to + # the end of the `run_cmd` + # custom_args="$@" + custom_args=("$@") + ############################## + #### Make it easy to track experiments by date ################### + year="$(date "+%Y")" + month="$(date "+%m")" + day="$(date "+%Y-%m-%d")" + today="$(date "+%Y-%m-%d")" # kept for backwards compatibility + started_at="$(date "+%Y-%m-%d-%H%M%S")" + export YEAR="${year}" + export MONTH="${month}" + export DAY="${day}" + export TODAY="${today}" + export STARTED_AT="${started_at}" + ################################################################## + # NOTE: to launch with DeepSpeed instead of mpiexec: + # `export LAUNCH_WITH=deepspeeed && bash train_llama_alcf.sh` + ################################################################## + setupLauncher "${LAUNCH_WITH:-MPICH}" || exit + export data_cache_path="${CKPT_DIR}/${DATA_CACHE_PATH}" && mkdir -p "${data_cache_path}" + printf "\n" + echo "Using data_cache_path: ${data_cache_path}" + ################################################################## + # WARN: to disable Llama-type architectures, toggle via: + # `NO_LLAMA=1 bash train_llama_alcf.sh` + ################################################################## + if [[ -z "${NO_LLAMA:-}" ]]; then + llama_flags=( + "--swiglu" + "--hidden-dropout 0" + "--attention-dropout 0" + "--normalization rmsnorm" + "--disable-bias-linear" + "--no-query-key-layer-scaling" + "--use-rotary-position-embeddings" + "--untie-embeddings-and-output-weights" + "--num-key-value-heads ${NUM_KV_HEAD}" + "--ffn-hidden-size ${FFN_HIDDEN_SIZE}" + ) + fi + # min_lr=$(python3 -c 'print(f"{2 / (10 ** 5):.8f}")') + # "--min-lr ${LR:-${min_lr}}" # 2e-5 + # "--min-lr ${MIN_LR:-"2e-6"}" # 2e-5 + lr_flags=( + "--lr ${LR:-0.0002}" + "--lr-decay-style ${LR_DECAY_STYLE:-cosine}" + "--lr-warmup-fraction ${LR_WARMUP_FRAC:-0.05}" + ) + if [[ -n "${LR_DECAY_ITERS:-}" ]]; then + lr_flags+=("--lr-decay-iters ${LR_DECAY_ITERS:-}") + fi + + tb_flags=() + if [[ -z "${NO_TENSORBOARD:-}" ]]; then + TBDIR="${CKPT_DIR}/tensorboard" + mkdir -p "${TBDIR}" + tb_flags+=( + "--log-timers-to-tensorboard" + "--log-optimizer-states-to-tensorboard" + "--tensorboard-dir ${TBDIR}" + ) + fi + dfl_fallback="${DATA_FILE_LIST:-${PBS_O_WORKDIR}/ALCF/data-lists/$(get_machine_name)/dolma.txt}" + + train_args=() + if [[ -z "${OVERRIDE_CKPT_OPT_PARAM:-}" ]]; then + train_args+=("--use-checkpoint-opt_param-scheduler") + fi + # "--init-method-std ${INIT_METHOD_STD:-0.0006}" + # "--shuffle-sample" + train_args+=( + "${lr_flags[@]}" + "${custom_args[@]}" + "${llama_flags[@]}" + "${DATA_FLAGS}" + "${FLASH_ARG}" + "${TIMING_STR}" + "${TOKENIZER_FLAGS}" + "${tb_flags[@]}" + "${ds_args[@]}" + "${gpt_args[@]}" + "--${DTYPE}" + "--shuffle-sample-in-corpus" + "--blend-sample-in-corpus" + "--accumulate-allreduce-grads-in-fp32" + "--no-bias-gelu-fusion" + "--no-bias-dropout-fusion" + "--no-masked-softmax-fusion" + "--no-gradient-accumulation-fusion" + "--optimizer=${OPT}" + "--tensor-model-parallel-size=${TP}" + "--pipeline-model-parallel-size=${PP}" + "--max-position-embeddings=${SEQ}" + "--micro-batch-size=${MICRO_BATCH}" + "--ds-sequence-parallel-size=${SP}" + "--global-batch-size=${GLOBAL_BATCH}" + "--split=${TRAIN_SPLIT:-990},${VAL_SPLIT:-10},${TEST_SPLIT:-0}" + "--timing-log-level=${TIMING_LOG_LEVEL:-1}" + "--eval-interval=${EVAL_INTERVAL:-100}" + "--eval-iters=${EVAL_ITERS:-20}" + 
"--save-interval=${SAVE_INTERVAL:-50}" + "--log-interval=${LOG_INTERVAL:-1}" + "--save=${SAVE:-${CKPT_DIR}}" + "--load=${LOAD:-${CKPT_DIR}}" + "--seq-length=${SEQ}" + "--num-layers=${NLAYERS}" + "--hidden-size=${HIDDEN}" + "--train-iters=${TRAIN_ITERS}" + "--distributed-backend=${BE}" + "--weight-decay=${WEIGHT_DECAY:-0.1}" + "--adam-beta1=${ADAM_BETA1:-0.9}" + "--adam-beta2=${ADAM_BETA2:-0.95}" + "--adam-eps=${ADAM_EPS:-0.00001}" + "--clip-grad=${CLIP_GRAD:-1.0}" + "--num-attention-heads=${HEADS}" + "--data-cache-path=${data_cache_path}" + "--data-file-list=${DATA_FILE_LIST:-${dfl_fallback}}" + ) + # "--adam-eps ${ADAM_EPS:-0.00001}" + cache_dir="${PBS_O_WORKDIR}/.cache/" + mkdir -p "${cache_dir}" + targs_cache="${cache_dir}/train_args.txt" + for arg in "${train_args[@]}"; do echo "${arg}" >>"${targs_cache}"; done + export TRAIN_ARGS=("$(printf '%s\n' "${train_args[@]}" | sort)") + printf "Training Arguments: %s\n" "${TRAIN_ARGS[@]}" + export run_cmd=("${LAUNCHER}" "${train_args[@]}") +} + +save_dotenv() { + if [[ "$#" -ne 1 ]]; then + estr="[error]" + printf "%s Expected one argument (outdir). Received: %s" "$(printRed "${estr}")" "$#" + else + outdir="$1" + mkdir -p "${outdir}" + module list + dotenv_file="${outdir}/.env" + echo "Saving environment to ${dotenv_file}" + printenv | grep -v "LS_COLORS" >"${dotenv_file}" + export DOTENV_FILE="${dotenv_file}" + fi +} + +###################################################################### +# get_machine_name: +# +# Return current machine name, as lowercase string +# +# Example: +# ```bash +# $ machine_name=$(get_machine_name) +# $ echo "machine_name: ${machine_name}" +# ``` +###################################################################### +get_machine_name() { + if [[ $(hostname) == x4* || $(hostname) == aurora* ]]; then + machine="aurora" + elif [[ $(hostname) == x1* || $(hostname) == uan* ]]; then + machine="sunspot" + elif [[ $(hostname) == x3* || $(hostname) == polaris* ]]; then + if [[ "${PBS_O_HOST}" == sirius* ]]; then + machine="sirius" + else + machine="polaris" + fi + elif [[ $(hostname) == nid* ]]; then + machine="perlmutter" + else + machine=$(hostname) + fi + echo "${machine}" +} + +get_machine() { + if [[ $(hostname) == x4* ]]; then + machine="aurora" + elif [[ $(hostname) == x1* ]]; then + machine="sunspot" + elif [[ $(hostname) == x3* ]]; then + if [[ "${PBS_O_HOST}" == sirius* ]]; then + machine="sirius" + else + machine="polaris" + fi + elif [[ $(hostname) == nid* ]]; then + machine="perlmutter" + else + echo "Unknown MACHINE. Setting MACHINE to $(hostname) and continuing..." + fi + export MACHINE="${machine}" + printf "Running on: %s\n" "$(printBlue "${MACHINE}")" +} + +check_and_kill_if_running() { + RUNNING_PIDS=$(lsof -i:29500 -Fp | head -n 1 | sed 's/^p//') + if [[ -n "${RUNNING_PIDS}" ]]; then + echo "Caught ${RUNNING_PIDS}" && kill "${RUNNING_PIDS}" + else + echo "Not currently running. Continuing!" 
+ fi +} + +setupSrun() { + if [[ $(hostname) == login* || $(hostname) == nid* ]]; then + export NHOSTS="${SLURM_NNODES:-1}" + export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" + export NGPUS="$((NHOSTS * NGPU_PER_HOST))" + export SRUN_EXEC="srun --gpus ${NGPUS} --gpus-per-node ${NGPU_PER_HOST} -N ${NHOSTS} -n ${NGPUS} -l -u --verbose" + else + echo "Skipping setupSrun() on $(hostname)" + fi +} + +printJobInfo() { + echo "++++++++++++++++++++++++++++++++++++++++++++++++++" + echo "- MPICH_DIR=${MPICH_DIR:-${MPI_ROOT}}" + echo "- Using $(which python3)" + echo "- WORLD_SIZE:${WORLD_SIZE-}" + echo "- BACKEND: ${BE:-}" + echo "- MODEL_TYPE: ${MODEL_TYPE:-}" + echo "- Using DATA_FILE_LIST: ${DATA_FILE_LIST:-}" + echo "++++++++++++++++++++++++++++++++++++++++++++++++++" +} + +############################################################################# +# setupLauncher: Launch with one of `{mpiexec, deepspeed}`. +# +# Explicitly, look for `LAUNCH_CMD` in environment and launch accordingly. +# Will use `mpiexec` by default. +# To launch with `deepspeed` instead, specify `LAUNCH_CMD=deepspeed`, e.g. +# +# ```bash +# PBS_O_WORKDIR=$(pwd) LAUNCH_CMD=deepspeed bash train_llama_alcf.sh +# ``` +# +# will launch with `deepspeed` instead of `mpiexec`. +############################################################################# +setupLauncher() { + if [[ "$#" == 1 ]]; then + local dist_launcher="$1" + else + local dist_launcher="${LAUNCH_WITH:-${LAUNCH_CMD:-"MPICH"}}" + fi + if [[ "${dist_launcher}" == "deepspeed" ]]; then + # Save {PATH, LD_LIBRARY_PATH, ...} to .deepspeed_env + saveDSenv || exit + # Assert `./hostfile_deepspeed` exists + export hfds="${WORKING_DIR}/hostfile_deepspeed" + make_ds_hostfile || exit + export LAUNCHER="deepspeed --hostfile $hfds --launcher MPICH ${EXEC}" + else + if [[ -n "${DIST_LAUNCH}" ]]; then + mn=$(get_machine_name) + if [[ "${mn}" == "aurora" || "${mn}" == "sunspot" ]]; then + LAUNCHER="${DIST_LAUNCH} --pmi=pmix --genvall $(which python3) -Wignore ${EXEC}" + else + LAUNCHER="${DIST_LAUNCH} --genvall $(which python3) -Wignore ${EXEC}" + fi + export LAUNCHER="${LAUNCHER}" + else + echo "[setupLauncher][INFO]: Saving environment to: .env-${PBS_JOBID}" + printenv | tee ".env-${PBS_JOBID}" + echo "[setupLauncher][ERROR]: DIST_LAUNCH not found in environment !!" + fi + fi + printf "Launching with: %s\n" "$(printRed "${dist_launcher}")" + printf " %s" "$(printMagenta "${LAUNCHER}")" +} + +# set_lr_args() { +# export LR=${LR:-0.0002} # LEARNING_RATE +# export LR_WARMUP_FRAC=${LR_WARMUP_FRAC:-0.05} # LEARNING RATE WARMUP +# export LR_DECAY_ITERS=${LR_DECAY_ITERS:-} # LR DECAY ITERS +# LR_ARGS="--lr ${LR} --lr-decay-style cosine" +# if [[ -n "${LR_DECAY_ITERS:-}" ]]; then +# LR_ARGS="${LR_ARGS} --lr-decay-iters ${LR_DECAY_ITERS}" +# fi +# if [[ -n "${LR_WARMUP_FRAC}" ]]; then +# LR_ARGS="${LR_ARGS} --lr-warmup-fraction ${LR_WARMUP_FRAC}" +# fi +# echo "LR_ARGS: ${LR_ARGS}" +# export LR_ARGS="${LR_ARGS}" +# } + +######################################################################### +# `get_batch_size_on_polaris`: Identify MICRO_BATCH to use on Polaris. +# +# - In particular, it seems that different node counts allow for different +# `MICRO_BATCH` sizes. +# +# Explicitly: +# +# - [1 <= NHOSTS <= 2]: `MICRO_BATCH=1` +# - [3 <= NHOSTS <= 7]: `MICRO_BATCH=2` +# - [8 <= NHOSTS]: `MICRO_BATCH=4` +# +# are the largest batch sizes that fit in memory at various node counts. 
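+#
+# A minimal usage sketch (this is how `setParams` consumes the helper; the
+# 4-node value below simply follows from the table above):
+#
+# ```bash
+# # on a 4-node Polaris job:
+# export MICRO_BATCH="${MICRO_BATCH:-$(get_batch_size_on_polaris)}"   # -> 2
+# ```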
+######################################################################### +get_batch_size_on_polaris() { + if [[ $(hostname) == x3* ]]; then + nhosts=$(wc -l <"${HOSTFILE:-${PBS_NODEFILE}}") + if [[ "${nhosts}" == 1 || "${nhosts}" == 2 ]]; then + mbs=1 + elif [[ "${nhosts}" -ge 3 && "${nhosts}" -le 7 ]]; then + mbs=2 + elif [[ "${nhosts}" -ge 8 ]]; then + mbs=4 + fi + fi + echo "${mbs}" +} + +_get_num_hosts_from_hostfile() { + if [[ "$#" == 1 ]]; then + if [[ -f "$1" ]]; then + nhosts=$(wc -l <"$1") + echo "${nhosts}" + else + exit 1 + fi + else + exit 1 + fi +} + +########################################### +# get_grad_acc_steps_on_aurora +# +# NOTE: +# We use different numbers of gradient +# accumulation steps (GAS) depending +# on the number of hosts in our job. +# +# Each host has: +# +# [2 tiles] x [6 xpus / tile] = 12 xpus +# +# | nnhosts | nhosts | GAS | +# |:---------------:|:----------:|:-----:| +# | 256 <= n < inf | [256, inf) | 1 | +# | 128 <= n < 256 | [128, 256) | 2 | +# | 32 <= n < 128 | [32, 128) | 4 | +# | 16 <= n < 32 | [16, 32) | 8 | +# | 0 <= n < 16 | [0, 16) | 16 | +# +########################################### +get_grad_acc_steps_on_aurora() { + if [[ "$#" == 0 ]]; then + hf="${HOSTFILE:-${PBS_NODEFILE:-$(ezpz_get_pbs_nodefile_from_hostname)}}" + elif [[ "$#" == 1 ]]; then + hf="$1" + else + echo "Usage: get_grad_acc_steps_on_aurora" + echo "Expected exactly 0 or 1 arguments, received: $#" + exit 1 + fi + nhosts=$(wc -l <"${hf}") + if [[ "${nhosts}" -gt 256 ]]; then + gas=1 + elif [[ 128 -le "${nhosts}" && "${nhosts}" -lt 256 ]]; then + gas=2 + elif [[ 32 -le "${nhosts}" && "${nhosts}" -lt 128 ]]; then + gas=4 + elif [[ 16 -le "${nhosts}" && "${nhosts}" -lt 32 ]]; then + gas=8 + else + gas=16 + fi + echo "${gas}" +} + +set_ccl_vars_on_aurora() { + export CCL_KVS_MODE=mpi + export CCL_CONFIGURATION_PATH="" + export CCL_CONFIGURATION=cpu_gpu_dpcpp + # export CCL_ROOT=/tmp/oneccl/ + # export LD_LIBRARY_PATH=${CCL_ROOT}/lib:$LD_LIBRARY_PATH + # export CPATH=${CCL_ROOT}/include:$CPATH + # export LIBRARY_PATH=${CCL_ROOT}/lib:$LIBRARY_PATH + export CCL_KVS_CONNECTION_TIMEOUT=3600 + export FI_CXI_RX_MATCH_MODE=hybrid + export CCL_BCAST=double_tree + + export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1 + export CCL_PROCESS_LAUNCHER=pmix # Required by Aurora mpich + export FI_PROVIDER=cxi # Required by Aurora mpich + export PALS_PMI=pmix # Required by Aurora mpich + export CCL_ATL_TRANSPORT=mpi # Required by Aurora mpich + export TORCH_LLM_ALLREDUCE=1 + export CCL_SYCL_ESIMD=1 + export CCL_ALLGATHERV_MEDIUM_SIZE_THRESHOLD=0 # Required by current oneCCL (MLSL-2881) + export CCL_ENABLE_SYCL_KERNELS=1 + export CCL_WORKER_AFFINITY=5,13,21,29,37,45,57,65,73,81,89,97 + export CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD=32768 + export FI_CXI_DEFAULT_CQ_SIZE=1048576 + export FI_CXI_RX_MATCH_MODE=hybrid + export CCL_BCAST=double_tree +} + +############################################################################## +# setParams +# +# Set / configure run options by parsing environment. +# +# - any of the declared options below can be overridden +# dynamically at runtime, e.g. 
to run with a `MICRO_BATCH` size of 2: +# ```bash +# $ PBS_O_WORKDIR=$(pwd) MICRO_BATCH=2 bash train_llama_alcf.sh +# ``` +############################################################################## +setParams() { + FLASH_ARG="" + # ---- [Parallelism Settings] -------------------------------------------+ + # ------ [Aurora] -------||------ [SunSpot] ------------- + # if [[ $(hostname) == x4* || $(hostname) == x1* ]]; then + mn=$(get_machine_name) + if [[ "${mn}" == "aurora" || "${mn}" == "sunspot" ]]; then + TP=${TP:-1} # TP = 1 + export SAVE_INTERVAL="${SAVE_INTERVAL:-50}" + export CCL=${CCL:-ccl} # CCL + export BE="${CCL}" # COMMUNICATION BACKEND = CCL + export DTYPE=${DTYPE:-bf16} # DTYPE: bf16 + # export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} # GRADIENT_ACC_STEPS + gas=$(get_grad_acc_steps_on_aurora "${PBS_NODEFILE:-${HOSTFILE:-${hostfile}}}") + export GRAD_ACC_STEPS="${GRAD_ACC_STEPS:-${gas}}" + # export GRAD_ACC_STEPS="${GRAD_ACC_STEPS:-$(get_grad_acc_steps_on_aurora "$@)}" + echo "[setParams] Using GRAD_ACC_STEPS: ${GRAD_ACC_STEPS}" + MICRO_BATCH=${MICRO_BATCH:-1} + if [[ -n "${NO_FLASH_ATTN-}" ]]; then + echo "Not using flash-attn!!" + else + FLASH_ARG="--use-flash-attn-builder" + fi + #### [sam: 08/17/2024] ########################################## + # Use best set of CCL env vars from Gordon Bell runs on Aurora + set_ccl_vars_on_aurora + ################################################################# + #### [sam: 06/20/2024] ############################################### + # export CCL_PROCESS_LAUNCHER=pmix + # export CCL_ATL_TRANSPORT=mpi + # !XXX: USE KEY VALUE STORE FIX ON AURORA [2024-06-20] + # use_kvs_fix_on_aurora # <-- why are these different from those in update_ccl_env_vars_aurora ?? + # update_ccl_env_vars_aurora + ###################################################################### + # if [[ -z "${USE_FLASH_ATTN:-}" ]]; then + # # NOTE: if NO_FLASH_ATTN is NON-empty; then NO FLASH ATTN !! + # export NO_FLASH_ATTN=1 # disabled on [2024-06-20] waiting on fix... + # if [[ -n "${NO_FLASH_ATTN-}" ]]; then + # echo "Not using flash-attn!!" + # else + # FLASH_ARG="--use-flash-attn-builder" + # fi + # else + # echo "Using flash-attn !!" + # FLASH_ARG="--use-flash-attn-builder" + # fi + # [Polaris] + elif [[ "${mn}" == "polaris" || "${mn}" == "sirius" ]]; then + # export LAUNCH_CMD="${LAUNCH_CMD:-deepspeed}" + TP=${TP:-1} # TP = 2 + export NCCL=${NCCL:-nccl} # NCCL + export BE="${NCCL}" # BE = NCCL + # export DTYPE=${DTYPE:-bf16} # DTYPE: BF16 ?? + export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 + export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-8} # GRADIENT_ACC_STEPS + # NOTE: MICRO_BATCH is exported below + # MICRO_BATCH=${MICRO_BATCH:-2} # MICRO_BATCH = 8 + export MICRO_BATCH="${MICRO_BATCH:-$(get_batch_size_on_polaris)}" + if [[ -n "${NO_FLASH_ATTN-}" ]]; then + echo "Not using flash-attn!!" + else + FLASH_ARG="--use-flash-attn-v2" + fi + echo "Setting up AWS NCCL OFI Plugin on Polaris..." + source "${WORKING_DIR}/ALCF/aws_ofi_nccl_plugin.sh" || exit + # [Perlmutter] + elif [[ "${mn}" == login* || "${mn}" == nid* ]]; then + TP="${TP:-2}" + export NCCL="${NCCL:-nccl}" + export BE="${NCCL}" + export DTYPE="${DTYPE:-bf16}" + MICRO_BATCH="${MICRO_BATCH:-1}" + if [[ -n "${NO_FLASH_ATTN-}" ]]; then + echo "Not using flash-attn!!" 
+ else + FLASH_ARG="--use-flash-attn-v2" + fi + fi + export TP="${TP}" + export PP="${PP:-1}" + export SP="${SP:-1}" + export FLASH_ARG="${FLASH_ARG}" + export DTYPE="${DTYPE:-bf16}" + export OPT="${OPT:-adamw}" + export WEIGHT_DECAY="${WEIGHT_DECAY:-0.1}" + export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" + NHOSTS=$(wc -l <"${HOSTFILE}") + if [[ -z "${NGPU_PER_HOST:-}" ]]; then + NGPU_PER_HOST=$(python3 -c 'import ezpz as ez; print(ez.get_gpus_per_node())') + fi + export NGPU_PER_HOST="${NGPU_PER_HOST}" + export WORLD_SIZE="${WORLD_SIZE:-$((NHOSTS * NGPU_PER_HOST))}" + # +---[Llama2 7B Config]--------------------------------------------------+ + # export MODEL_KEY="Llama-7B" + export HEADS=${HEADS:-${NHEADS:-32}} # NUMBER OF ATEN HEADS + export NLAYERS=${NLAYERS:-${NUM_LAYERS:-32}} # NUMBER OF LAYERS + export HIDDEN=${HIDDEN:-4096} # HIDDEN SIZE + export NUM_KV_HEAD=${NUM_KV_HEAD:-8} # GROUP ATTENTION + export FFN_HIDDEN_SIZE=${FFN_HIDDEN_SIZE:-11008} # FFN HIDDEN SIZE + # +---[Run Settings]------------------------------------------------------+ + export SEQ=${SEQ:-4096} # SEQ_LEN: 4096 + export ZERO_STAGE=${ZERO_STAGE:-1} # ZERO OFFLOADING STAGE + export MICRO_BATCH=${MICRO_BATCH:-1} # MICRO BATCH SIZE + export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} # GRADIENT ACCUMULATION STEPS + export TIMING_LOG_LEVEL="${TIMING_LOG_LEVEL:-1}" # TIMING VERBOSITY IN LOGS + export ACT_CKPT_NUM_LAYERS="${ACT_CKPT_NUM_LAYERS:-1}" # NUM LAYERS TO CHECKPOINT ACTIVATIONS + export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-} # USE ACTIVATION CHECKPOINTING ? + export GLOBAL_BATCH_MAX=$((WORLD_SIZE * MICRO_BATCH * GRAD_ACC_STEPS / TP / PP / SP)) # MAX GLOBAL BATCH SIZE + export GLOBAL_BATCH="${GLOBAL_BATCH:-${GLOBAL_BATCH_MAX}}" # WILL USE MAX IF NOT SET IN ENVIRONMENT + if [[ -n "${TRAIN_TOKENS:-}" ]]; then + export TRAIN_TOKENS="${TRAIN_TOKENS}" + export TRAIN_ITERS=$((TRAIN_TOKENS / SEQ / GLOBAL_BATCH)) + printf "TRAIN_TOKENS=%s (=%sB tokens)\n" "${TRAIN_TOKENS}" "$((TRAIN_TOKENS / 10 ** 9))" + printf "TRAIN_ITERS=%s\n" "${TRAIN_ITERS}" + elif [[ -z "${TRAIN_ITERS:-${TRAIN_ITER:-}}" ]]; then + export TRAIN_TOKENS=${TRAIN_TOKENS:-2000000000000} + export TRAIN_ITERS=$((TRAIN_TOKENS / SEQ / GLOBAL_BATCH)) + printf "TRAIN_TOKENS=%s (=%sB tokens)\n" "${TRAIN_TOKENS}" "$((TRAIN_TOKENS / 10 ** 9))" + printf "TRAIN_ITERS=%s\n" "${TRAIN_ITERS}" + else + export TRAIN_ITERS="${TRAIN_ITERS:-${TRAIN_ITER:-}}" + fi + export MODEL_TYPE="llama-gb${GLOBAL_BATCH}-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" # STRING FOR IDENTIFYING MODEL + # NOTE: [2024-07-10] ##################################################### + # - [sam]: For whatever reason, it seems that using + # sequence-parallelism (SP) > 1 is INCOMPATIBLE with + # rotary-position-embeddings (ROPE). + # + # For this reason, we only use the default LLAMA_ARGS when SP=0. 
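+    #
+    # Worked example (hypothetical but representative values for the 7B
+    # config above): with WORLD_SIZE=768, MICRO_BATCH=4, GRAD_ACC_STEPS=1
+    # and TP=PP=SP=1,
+    #
+    #     GLOBAL_BATCH_MAX = 768 * 4 * 1 / (1 * 1 * 1) = 3072
+    #
+    # and, at SEQ=4096 with the default TRAIN_TOKENS=2e12,
+    #
+    #     TRAIN_ITERS = 2e12 / (4096 * 3072) ≈ 158,946
+    #
+    # i.e. a 2T-token run corresponds to roughly 159k training iterations.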
+ ########################################################################## + # # -----[Learning Rate Settings]-------------------------------------------- + # export LR=${LR:-0.0002} # LEARNING_RATE + # export LR_WARMUP_FRAC=${LR_WARMUP_FRAC:-0.05} # LEARNING RATE WARMUP + # export LR_DECAY_ITERS=${LR_DECAY_ITERS:-} # LR DECAY ITERS + # set_lr_args + # -----[Learning Rate Settings]-------------------------------------------- + # # if [[ "${TIMING_LOG_LEVEL:-1}" -gt 1 ]]; then + # if [[ "${TIMING_LOG_LEVEL:-1}" -gt 1 ]]; then + # TIMING_STR="\ + # --timing-log-level ${TIMING_LOG_LEVEL}" + # # " + # else + # TIMING_STR="" + # fi +} + +############################################## +# set_args +# +# Specify additional (DeepSpeed specific) +# arguments to pass to pretrain_gpt_alcf.py +############################################## +set_args() { + # ---- Set DeepSpeed arguments -------------------------------- + ds_args=( + "--deepspeed" + ) + if [[ "${PP:-1}" == 1 ]]; then + ds_args+=("--no-pipeline-parallel") + fi + ds_args+=("--deepspeed_config=${DS_CONFIG}") + ds_args+=("--zero-stage=$ZERO_STAGE") + if [[ "${ZERO_STAGE}" == 3 ]]; then + ds_args+=("--use-mics") + fi + # ds_args=" " + # ds_args=" --deepspeed ${ds_args}" + # if [[ $PP == 1 ]]; then + # ds_args=" --no-pipeline-parallel ${ds_args}" + # fi + # ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" + # ds_args="--zero-stage=$ZERO_STAGE ${ds_args}" + # if [[ "${ZERO_STAGE}" == 3 ]]; then + # ds_args="--use-mics ${ds_args}" + # fi + if [[ -n "${USE_ACTIVATION_CHECKPOINTING:-}" ]]; then + echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" + ds_args+=("--deepspeed-activation-checkpointing") + # ds_args=" --deepspeed-activation-checkpointing ${ds_args}" + # --checkpoint-activations \ + # --deepspeed-activation-checkpointing + fi + export ds_args + # --------------------------------------------------------------- + gpt_args=() + # we are now using activation checkpoint provided by megatron, see below. + # ds_args=" --deepspeed-activation-checkpointing ${ds_args}" + if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then + echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" + gpt_args+=( + "--checkpoint-activations" + "--checkpoint-num-layers ${ACT_CKPT_NUM_LAYERS}" + ) + fi + export gpt_args +} + +make_ds_hostfile() { + export GPUS_PER_NODE="${GPUS_PER_NODE:-${NGPU_PER_HOST:-${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}}}" + # ---- Make MPICH hostfile ---------------- + hf="${HOSTFILE:-${PBS_NODEFILE}}" + export hostfile_mpich=hostfile_mpich + cat "${hf}" >"${hostfile_mpich}" + # ---- Make DeepSpeed hostfile ------------------- + export hostfile_deepspeed=hostfile_deepspeed + cat "${hf}" >"${hostfile_deepspeed}" + sed -e "s/$/ slots=${GPUS_PER_NODE}/" -i "${hostfile_deepspeed}" +} + +########################################### +# ezpz_setup +# +# 1. Clone [`saforem2/ezpz`](https://github.com/saforem2/ezpz) (if necessary) +# to `"${WORKING_DIR}/deps/ezpz/"` +# +# 2. Source [`ezpz/src/ezpz/bin/utils.sh`](https://github.com/saforem2/ezpz/blob/main/src/ezpz/bin/utils.sh) +# - This provides `{ezpz_setup_python, ezpz_setup_job}` (called below) +# +# 3. Call `ezpz_setup_python` (from `ezpz/bin/utils.sh`): +# - This will setup conda + virtual enviroment +# +# 4. Call `ezpz_setup_job` (from `ezpz/bin/utils.sh`): +# - This will parse `$PBS_*` variables and build launch cmd +# +# 3. 
Call `_ezpz_install` (from `Megatron-DeepSpeed/ALCF/helpers.sh`): +# - Install ezpz from `"${WORKING_DIR}/depz/ezpz/"` +########################################### +ezpz_setup() { + # setup_alcf "$@" + # file=$(mktemp) + # curl -Ls https://raw.githubusercontent.com/saforem2/ezpz/main/src/ezpz/bin/getjobenv > "${file}" + # shellcheck source=../deps/ezpz/src/ezpz/bin/utils.sh + ezdir="${WORKING_DIR}/deps/ezpz" + if [[ -d "${ezdir}" ]]; then + echo "Found ezpz in ${ezdir}" + else + mkdir -p "$(dirname "${ezdir}")" + git clone https://github.com/saforem2/ezpz "${ezdir}" + fi + # shellcheck source=../deps/ezpz/src/ezpz/bin/utils.sh + source "${ezdir}/src/ezpz/bin/utils.sh" || exit + ezpz_setup_python + ezpz_setup_job "$@" + ezpz_pip_loc=$(python3 -m pip list | grep ezpz | awk '{print $NF}') + if [[ -z "${ezpz_pip_loc:-}" ]]; then + printf "[ezpz_install] Installing ezpz from %s\n" "${ezdir}" + python3 -m pip install -e "${ezdir}" --require-virtualenv + else + printf "[ezpz_install] Found ezpz @ %s\n" "${ezpz_pip_loc}" + fi +} + +####################################################################### +# ezpz_test: Run simple test to make sure all nodes in working order +####################################################################### +ezpz_test() { + printf "%s" "[$(printBlue 'ezpz:test_dist')][INFO] Running ezpz.test_dist...\n" + # [ -n "${PBS_O_WORKIR}" ] && ezpz_savejobenv || ezpz_getjobenv + # python3 -Wignore -m ezpz.jobs && source "${PBS_O_WORKDIR}/.jobenv" + printf "%s" "[$(printBlue 'ezpz:test_dist')] Running test: ${eztest}\n" + eztest="TRAIN_ITERS=50 ${LAUNCH_CMD} python3 -Wignore -m ezpz.test_dist" + eval "${eztest}" + printf "%s" "[$(printBlue 'ezpz:test_dist')] Done with test!\n" +} + +############################################################################ +# saveDSenv +# +# Save important environment variables to .deepspeed_env, which will be +# forwarded to ALL ranks with DeepSpeed +############################################################################ +saveDSenv() { + echo "Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env" + { + echo "PATH=${PATH}" + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + echo "http_proxy=${http_proxy:-}" + echo "https_proxy=${https_proxy:-}" + echo "CFLAGS=${CFLAGS}" + echo "PYTHONUSERBASE=$PYTHONUSERBASE" + } >.deepspeed_env +} + +get_output_prefix() { + # ---- Specify output location -------------------------------- + pre="ws${WORLD_SIZE}_ds_stage${ZERO_STAGE}_nl${NLAYERS}" + pre="${pre}_hs${HIDDEN}_mb${MICRO_BATCH}" + pre="${pre}_seq${SEQ}_gb${GLOBAL_BATCH}" + pre="${pre}_sp${SP}_pp${PP}_tp${TP}_${DTYPE}_opt${OPT}" + pre="${pre}_lr${LR}_lwf${LR_WARMUP_FRAC}" + if [[ -n "${TOKENIZER_TYPE:-}" ]]; then + _tok=$(echo "${TOKENIZER_TYPE}" | sed 's/Tokenizer//g') # noqa + pre="${pre}_tok${_tok}" + fi + if [[ -n "${LR_DECAY_ITERS}" ]]; then + pre="${pre}_ldi${LR_DECAY_ITERS}" + fi + if [[ -z "${NO_FLASH_ATTN:-}" ]]; then + pre="${pre}_flash" + fi + export OUTPUT_PREFIX="${pre}" + echo "${pre}" +} + +setOutput() { + # OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" + OUTPUT_PREFIX=$(get_output_prefix) + OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%Y%m%d-%H%M%S)_${WORLD_SIZE}_${HOSTNAME}" + export OUTPUT_DIR="${OUTPUT_DIR}" && mkdir -p "${OUTPUT_DIR}" + export OUTPUT_LOG="${OUTPUT_DIR}/output.log" + echo "${OUTPUT_LOG}" >>"logs/latest" + printf "\n Please see logs at: %s\n" "$(printGreen "${OUTPUT_DIR}")" +} + +get_checkpoint_dir() { + if [[ -n "${CKPT_DIR:-}" ]]; then + echo "${CKPT_DIR}" 
+ else + echo "checkpoints/$(get_output_prefix)" + fi +} + +setup_checkpoint() { + ckpt_dir=$(get_checkpoint_dir) + export CKPT_DIR="${ckpt_dir}" + printf "Checkpoints will be saved to: %s\n" "$(printYellow "${CKPT_DIR}")" +} + +############################################# +# Build DeepSpeed config and write to .json +############################################# +buildDSconfig() { + # export CPU_OPTIMIZER="${CPU_OPTIMIZER:-0}" + export DS_CONFIG="${WORKING_DIR}/ds-configs/ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" + mkdir -p "$(dirname "${DS_CONFIG}")" + echo "DS_CONFIG: ${DS_CONFIG}" + printf "ZS: %s, MB: %s, GB: %s, PP: %s, DTYPE: %s" "${ZERO_STAGE}" "${MICRO_BATCH}" "${GLOBAL_BATCH}" "${PP}" "${DTYPE}" + generateDSconfig "${DS_CONFIG}" + cat "${DS_CONFIG}" | jq . +} + +############################################################################### +# sumWeights +# +# This will sum the weights (first column) from each line in the passed +# `file_list`. +############################################################################### +sumWeights() { + local file_list=$1 + weights=$(cat "${file_list}" | awk '{print $1}' | tr '\n' '\ ,\ ' | sed 's/^/[/g' | sed 's/$/]/g' | tr '\ ' "\,\ ") + python3 -c "import numpy as np; print(np.sum(${weights}))" +} + +sumFiles() { + local rd=$1 + for f in $("${rd}/*.txt"); do + ws=$(sumWeights "${rd}/${f}") + echo "sum($f.weights)=${ws}" + done +} + +########################################### +# make_data +# +# This will run `make` in `megatron/data` +# prior to launching, ensuring that +# `megatron/data/helpers.cpp` +# is built appropriately. +########################################### +make_data() { + python3 -m pip install pybind11 + mdir="${WORKING_DIR}/megatron/data" + cd "${mdir}" && make && cd - +} + +############################################################################## +# install_dependencies +# +# Ensure all dependencies installed from `ALCF/requirements/requirements.txt` +############################################################################## +install_dependencies() { + depsfile="${WORKING_DIR}/ALCF/requirements/requirements.txt" + echo "[install_dependencies] Ensuring all dependencies from ${depsfile} installed..." + python3 -m pip install -r "${depsfile}" --require-virtualenv 1>/dev/null + if [[ ! -x "$(command -v deepspeed)" ]]; then + mn=$(get_machine_name) + # if [[ "${mn}" == aurora* || "${mn}" == sunspot* ]]; then + # install_deepspeed_for_xpu || exit + # fi + printf "[install_dependencies] No 'deepspeed' command found on %s" "${mn}" + printf "[install_dependencies] !! No deepsepeed in %s" "$(which python3)" + fi +} + +################################################# +# Fix for distributed key value store on Aurora +################################################# +use_kvs_fix_on_aurora() { + export CCL_KVS_MODE=mpi + export CCL_CONFIGURATION_PATH="" + export LD_LIBRARY_PATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/lib:$LD_LIBRARY_PATH + export CPATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/include:$CPATH + export LIBRARY_PATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/lib:$LIBRARY_PATH + ######################################################### + # if not set, CCL will complain... ? 
+ export NUMEXPR_MAX_THREADS="${NUMEXPR_MAX_THREADS:-16}" + ######################################################### +} + +update_ccl_env_vars_aurora() { + # export CCL_KVS_MODE=mpi + # # export CCL_CONFIGURATION_PATH="" + # # unset CCL_CONFIGURATION_PATH + # # export CCL_CONFIGURATION=cpu_gpu_dpcpp + # # export CCL_ROOT="/flare/Aurora_deployment/intel/ccl/_install_release_2021_13" + # export LD_LIBRARY_PATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/lib:$LD_LIBRARY_PATH + # export CPATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/include:$CPATH + # export LIBRARY_PATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/lib:$LIBRARY_PATH + # # export CCL_ALLREDUCE_SCALEOUT=direct + # printenv | grep -E -v "^__" | grep -E "CCL|LD|CPATH|LIBRARY_PATH" + ######################################################### + # if not set, CCL will complain... ? + export NUMEXPR_MAX_THREADS="${NUMEXPR_MAX_THREADS:-16}" + ######################################################### + # Sam: [2024-06-29] + export CCL_KVS_MODE=mpi + export CCL_CONFIGURATION_PATH="" + export CCL_CONFIGURATION=cpu_gpu_dpcpp + export CCL_ROOT="/flare/Aurora_deployment/intel/ccl/_install_release_2021_13" + export LD_LIBRARY_PATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/lib:$LD_LIBRARY_PATH + export CPATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/include:$CPATH + export LIBRARY_PATH=/flare/Aurora_deployment/intel/ccl/_install_release_2021_13/lib:$LIBRARY_PATH +} + +########################################################## +# Check that we can find the `.py` file we wish to launch +########################################################## +check_executable() { + fp=$1 + if [[ -f "${fp}" ]]; then + export EXEC="${fp}" + # ----[1.5 Keep track of stem from file path]------------------------- + exec_stem=$(echo "${EXEC}" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.py//g") + export EXEC_STEM="${exec_stem}" + else + estr="Unable to locate executable ${fp}" + printf "[ALCF.helpers:check_executable] %s\n" "$(printRed "${estr}")" + fi +} + +###################################################################### +# `makeHostiles`: +# Detect if `HOSTFILE` set in active environment. +# - If so, use this. +# - Otherwise, make default HOSTFILEs from "${PBS_NODEFILE}" +###################################################################### +makeHostfiles() { + if [[ -n "${HOSTFILE}" ]]; then + printf "!! USING CUSTOM HOSTFILE FROM: %s" "${HOSTFILE}" + else + make_ds_hostfile + fi +} + +################################################## +# Setup tokenizer as either Llama2 or GPT2 style +################################################## +setup_tokenizer_and_data() { + if [[ "$#" == 1 ]]; then + tok="$1" + dfl="${DATA_FILE_LIST:-}" + elif [[ "$#" == 2 ]]; then + tok="$1" + dfl="$2" + else + echo "Incorrect number of arguments passed. 
Received: $#, expected 2" + fi + echo "Setting up tokenizer with ${tok}" + echo "Using data_file_list: ${dfl}" + _data_flags=() + _tokenizer_flags=() + if [[ ${tok} == gpt* || ${tok} == GPT* ]]; then + export TOKENIZER_TYPE="GPT2" + _tokenizer_flags+=("--tokenizer-type GPT2BPETokenizer") + machine=$(get_machine_name) + if [[ ${machine} == "polaris" ]]; then + export DATA_PARENT="${DATA_PARENT:-/eagle/argonne_tpc/foremans/projects/argonne-lcf/Megatron-DeepSpeed/dataset}" + elif [[ ${machine} == "sunspot" ]]; then + export DATA_PARENT="${DATA_PARENT:-/gila/Aurora_deployment/foremans/anl_24_q2_release/Megatron-DeepSpeed/dataset}" + elif [[ ${machine} == "aurora" ]]; then + export DATA_PARENT="${DATA_PARENT:-/gecko/Aurora_deployment/foremans/projects/argonne-lcf/Megatron-DeepSpeed/dataset}" + else + export DATA_PARENT="${DATA_PARENT:-${WORKING_DIR}/dataset}" + fi + export VOCAB_FILE="${DATA_PARENT}/gpt2-vocab.json" + export MERGE_FILE="${DATA_PARENT}/gpt2-merges.txt" + export DATA_PATH="${DATA_PARENT}/BookCorpusDataset_text_document" + _data_flags+=( + "--data-path ${DATA_PATH}" + "--vocab-file ${VOCAB_FILE}" + "--merge-file ${MERGE_FILE}" + ) + else + export TOKENIZER_TYPE="${TOKENIZER_TYPE:-Llama2Tokenizer}" + tm="${WORKING_DIR}/ALCF/tokenizer.model" # fallback: Megatron-DeepSpeed/ALCF/tokenizer.model + export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" # USE TOKENIZER_MODEL from env, else fallback from ^ + _tokenizer_flags+=( + "--tokenizer-type ${TOKENIZER_TYPE}" + "--tokenizer-model ${TOKENIZER_MODEL}" + ) + # if [[ "${TOKENIZER_TYPE}" != "GPT2" ]]; then + echo "Using tokenizer: ${TOKENIZER_TYPE}. Setting up data with ${DATA_FILE_LIST:-}" + setData "${dfl}" || exit + fi + export DATA_FLAGS="${_data_flags[*]}" + export TOKENIZER_FLAGS="${_tokenizer_flags[*]}" + printf "[setData] DATA_FLAGS: %s\n" "$(printGreen "${DATA_FLAGS}")" + printf "[setData] TOKENIZER_FLAGS: %s\n" "$(printMagenta "${TOKENIZER_FLAGS}")" +} + +############################################### +# setData +# +# Ensure `DATA_FILE_LIST` is set, +# fallback to default values if necessary. +############################################### +setData() { # ------------------------[dfl: abbrv. 
for DATA_FILE_LIST] + ####### [Set DATA_FILE_LIST_FALLBACK based on current machine] ############# + mn=$(get_machine_name) + dfl_fallback="${WORKING_DIR}/ALCF/data-lists/${mn}/dolma.txt" + ############################################################################ + # set `dfl` to `dfl_fallback` if not passed as an argument, + # use this data file list to call `setData` + dfl="${1:-${dfl_fallback}}" + printf "Calling: setData() with %s\n" "${dfl}" + ndocs=$(wc -l <"${dfl}") + ws=$(sumWeights "${dfl}") + dfl_stem=$(echo "${dfl}" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") + dcp=".cache/${dfl_stem}/index-cache" + export DATA_FILE_LIST="${dfl}" + export NUM_DOCS="${ndocs}" + export WEIGHT_SUM="${ws}" + export DFL_STEM="${dfl_stem}" + export DATA_CACHE_PATH="${dcp}" + # export DATA_FLAGS="${DATA_FLAGS} --data-file-list ${DATA_FILE_LIST}" # --data-cache-path ${DATA_CACHE_PATH}" + echo "--------------------" + echo "Updated environment:" + printf "DATA_FILE_LIST: %s\n" "${DATA_FILE_LIST}" + printf "NUM_DOCS: %s\n " "${NUM_DOCS}" + printf "WEIGHT_SUM: %s\n" "${WEIGHT_SUM}" + printf "DFL_STEM: %s\n" "${DFL_STEM}" + printf "DATA_CACHE_PATH: %s\n" "${DATA_CACHE_PATH}" + printf "DATA_FLAGS: %s\n" "${DATA_FLAGS}" + echo "--------------------" +} + +generateDSconfig_new() { + cat <"${CONFIG_JSON}" + { + "train_batch_size" : $GLOBAL_BATCH, + "train_micro_batch_size_per_gpu": $MICRO_BATCH, + "steps_per_print": 1, + + "zero_optimization": { + "stage": $ZERO_STAGE + }, + + "bf16": { + "enabled": true + }, + + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "wall_clock_breakdown" : false + } +EOT +} + +################################################################################ +# generateDSconfig +# +# Create and save a deepspeed config .json +# +# This will contain the appropriate variables as set in the current environment. 
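+#
+# Usage sketch (assumes GLOBAL_BATCH, MICRO_BATCH, GRAD_ACC_STEPS, ZERO_STAGE,
+# PP and DTYPE have already been exported, e.g. by `setParams`; this mirrors
+# how `buildDSconfig` calls it):
+#
+# ```bash
+# generateDSconfig "${DS_CONFIG}"
+# jq . "${DS_CONFIG}"   # pretty-print the generated config
+# ```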
+################################################################################ +generateDSconfig() { + if [ $# -ne 1 ]; then + echo "Usage: $0 config_file" + exit 1 + fi + for v in "$GLOBAL_BATCH" "$MICRO_BATCH" "$GRAD_ACC_STEPS" "$ZERO_STAGE" "$PP" "$DTYPE"; do + if [ -z "$v" ]; then + echo "Please export required envs before execute $0" + exit 1 + fi + done + # \"scheduler\": { + # \"type\": \"WarmupLR\", + # \"params\": { + # \"warmup_min_lr\": 0.00003, + # \"warmup_max_lr\": 0.0003, + # \"warmup_num_steps\": 5000 + # } + # }, + extra="" + common="\ + \"train_batch_size\": $GLOBAL_BATCH, + \"train_micro_batch_size_per_gpu\": $MICRO_BATCH, + \"gradient_clipping\": 1.0, + \"steps_per_print\": 1, + \"gradient_accumulation_steps\": $GRAD_ACC_STEPS, + \"zero_force_ds_cpu_optimizer\": false, + \"zero_allow_untested_optimizer\": true, + \"wall_clock_breakdown\": false," + # if [[ "${USE_ACTIVATION_CHECKPOINTING}" == 1 ]]; then + # activation_checkpointing="\ + # \"activation_checkpointing\": { + # \"partition_activations\": true, + # \"contiguous_memory_optimization\": true + # }," + # fi + if [[ $DTYPE == "bf16" ]]; then + # \"communication_data_type\": \"bf16\", + dtype="\ + \"fp16\": { + \"enabled\": false, + \"loss_scale\": 0, + \"loss_scale_window\": 1000, + \"hysteresis\": 2, + \"min_loss_scale\": 1 + }, + \"bfloat16\": { + \"enabled\": true, + \"loss_scale\": 1.0 + }," + elif [[ $DTYPE == "fp16" ]]; then + dtype="\ + \"communication_data_type\": \"fp16\", + \"fp16\": { + \"enabled\": true, + \"loss_scale\": 0, + \"loss_scale_window\": 1000, + \"hysteresis\": 2, + \"min_loss_scale\": 1 + }, + \"bfloat16\": { + \"enabled\": false, + \"loss_scale\": 1.0 + }," + else + dtype="\"communication_data_type\": \"fp32\"," + fi + if [[ "${OPT:-}" == "ds.adamw" ]]; then + optimizer="\ + \"optimizer\": { + \"type\": \"AdamW\", + \"params\": { + \"lr\": ${LR}, + \"beta1\": ${ADAM_BETA1}, + \"beta2\": ${ADAM_BETA2}, + \"eps\": ${ADAM_EPS}, + \"weight_decay\": 1e-1 + }, + }," + elif [[ "${OPT:-}" == "ds.onebitlamb" ]]; then + optimizer="\ + \"optimizer\": { + \"type\": \"OneBitLamb\", + \"params\": { + \"lr\": 11e-3, + \"max_coeff\": 0.3, + \"min_coeff\": 0.01, + \"freeze_step\": 1000, + \"cuda_aware\": false, + \"comm_backend_name\": \"${BE}\", + \"coeff_beta\": 0.9, + \"factor_max\": 4.0, + \"factor_min\": 0.5, + \"factor_threshold\": 0.1 + } + }," + else + optimizer="" + fi + if [[ "${ZERO_STAGE}" == 3 ]]; then + # \"mics_shard_size\": 2, + zero="\ + \"zero_optimization\": { + \"stage\": 3, + \"reduce_scatter\": false, + \"mics_hierarchical_params_gather\": true, + \"stage3_max_live_parameters\": 3e9, + \"stage3_max_reuse_distance\": 3e9, + \"stage3_param_persistence_threshold\": 1e5, + \"stage3_prefetch_bucket_size\": 5e7, + \"contiguous_gradients\": true, + \"overlap_comm\": true, + \"reduce_bucket_size\": 90000000, + \"sub_group_size\": 1e9, + \"offload_optimizer\": { + \"device\": \"none\", + \"buffer_count\": 4, + \"pipeline_read\": false, + \"pipeline_write\": false, + \"pin_memory\": true + } + }," + # elif [[ $ZERO_STAGE == 2 ]]; then + elif [[ "${ZERO_STAGE}" == 2 || "${ZERO_STAGE}" == 1 ]]; then + if [[ -n "${CPU_OPTIMIZER:-}" ]]; then + echo "!!!! CAUGHT CPU_OPTIMIZER !!!!" 
+ zero="\ + \"zero_optimization\": { + \"stage\": $ZERO_STAGE, + \"offload_optimizer\": { + \"device\": \"cpu\" + } + }," + else + zero="\ + \"zero_optimization\": { + \"stage\": $ZERO_STAGE + }," + fi + if [[ "${PP}" -gt 1 ]]; then + extra="\ + \"data_types\": { + \"grad_accum_dtype\": \"fp32\" + }, + \"comms_logger\": { + \"enabled\": true, + \"verbose\": false, + \"prof_all\": true, + \"debug\": false + }," + else + extra="\ + \"comms_logger\": { + \"enabled\": ${COMMS_LOGGER:-false}, + \"verbose\": false, + \"debug\": false + }," + fi + else + echo 'Please add the correct config set!!!' + fi + flops_profiler="\ + \"flops_profiler\": { + \"enabled\": true, + \"profile_step\": 2, + \"module_depth\": -1, + \"top_modules\": 1, + \"detailed\": true, + \"output_file\": null + }" + cat <"$1" +{ +$common +$optimizer +$zero +$dtype +$extra +$flops_profiler +} +EOT +} + +# ##################### +# # train +# ##################### +# train() { +# # 1. Navigate into `$PBS_O_WORKDIR` <-- [should be Megatron-Deepspeed] +# cd "${PBS_O_WORKDIR}" || exit +# HERE=$(python3 -c 'import os; print(os.getcwd())') && export HERE +# # 2. source `ALCF/helpers.sh` <-- [should be ./ALCF/helpers.sh] +# source "${HERE}/ALCF/helpers.sh" || exit +# # 3. call `setup` from `./ALCF/helpers.sh` +# # export DATA_FILE_LIST="${HERE}/ALCF/data-lists/$(get_machine_name)/books.txt" +# setup || exit +# # 4. Take custom args +# export custom_args=" $@" +# # 5. Update ${run_cmd} (from setup ALCF/helpers.sh) with ${custom_args} +# export run_cmd="${run_cmd} ${custom_args}" +# # 6. Add "${run_cmd}" to output log +# echo "${run_cmd}" | tee -a "${OUTPUT_LOG}" +# # 7. Tell user where to find output +# printf "[!! %s] View output at:\n %s\n" "$(printBlue "NOTE")" "$(printYellow "${OUTPUT_LOG}")" | tee -a "${OUTPUT_LOG}" +# # 8. 
Evaluate ${run_cmd} and append outputs to ${OUTPUT_LOG} +# eval "${run_cmd}" |& tee -a "${OUTPUT_LOG}" +# set +x +# } + +############################################### +# Helper functions for printing colored text +############################################### +RESET="\e[0m" +BLACK="\e[1;30m" +RED="\e[1;31m" +GREEN="\e[1;32m" +YELLOW="\e[1;33m" +BLUE="\e[1;34m" +CYAN="\e[1;35m" +# WHITE="\e[1;36m" + +printBlack() { + printf "\e[1;30m%s\e[0m\n" "$@" +} + +printRed() { + printf "\e[1;31m%s\e[0m\n" "$@" +} + +printGreen() { + printf "\e[1;32m%s\e[0m\n" "$@" +} + +printYellow() { + printf "\e[1;33m%s\e[0m\n" "$@" +} + +printBlue() { + printf "\e[1;34m%s\e[0m\n" "$@" +} + +printMagenta() { + printf "\e[1;35m%s\e[0m\n" "$@" +} + +printCyan() { + printf "\e[1;36m%s\e[0m\n" "$@" +} + +printWhite() { + printf "\e[1;37m%s\e[0m\n" "$@" +} + +reset_env() { + custom_vars=( + NO_FLASH_ATTN + USE_FLASH_ATTN + TP + PP + SP + FLASH_ARG + OPT + ADAM_BETA1 + ADAM_BETA2 + ADAM_EPS + WEIGHT_DECAY + HEADS + NLAYERS + HIDDEN + NUM_KV_HEAD + FFN_HIDDEN_SIZE + SEQ + ZERO_STAGE + MICRO_BATCH + EVAL_ITERS + EVAL_INTERVAL + TIMING_LOG_LEVEL + ACT_CKPT_NUM_LAYERS + USE_ACTIVATION_CHECKPOINTING + GLOBAL_BATCH_MAX + GLOBAL_BATCH + TRAIN_TOKENS + TRAIN_ITERS + MODEL_TYPE + LR + LR_WARMUP_FRAC + LR_DECAY_ITERS + LR_ARGS + CPU_OPTIMIZER + DS_CONFIG + OUTPUT_DIR + OUTPUT_LOG + CKPT_DIR + ds_args + EXEC + EXEC_STEM + DATA_FLAGS + TOKENIZER_TYPE + TOKENIZER_MODEL + TOKENIZER_FLAGS + DATA_FILE_LIST + NUM_DOCS + WEIGHT_SUM + DFL_STEM + DATA_CACHE_PATH + DOTENV_FILE + YEAR + MONTH + DAY + TODAY + STARTED_AT + LAUNCHER + data_cache_path + DEFAULTS + ) + # LLAMA_ARGS + printf "Unsetting custom vars: %s\n" "${custom_vars[*]}" + unset "${custom_vars[@]}" +} + +convert_ckpt_to_universal() { + if [[ "$#" -ne 1 ]]; then + echo "Usage: convert_ckpt_to_universal ckpt_dir" + echo "Expected one argument (ckpt_dir), received: $#" + exit 1 + fi + ckptdir=$1 + gs=$(cat "${ckptdir}/latest_checkpointed_iteration.txt") + src="${ckptdir}/global_step${gs}" + dst="${ckptdir}/global_step${gs}_universal" + convert_script="${PBS_O_WORKDIR}/deps/DeepSpeed/checkpoint/ds_to_universal.py" + python3 "${convert_script}" --input_folder "${src}" --output_folder "${dst}" +} + +########################### +# call helpers_main() +########################### +helpers_main diff --git a/ALCF/notes/checkpoints.md b/ALCF/notes/checkpoints.md new file mode 100644 index 0000000000..f5acd425a7 --- /dev/null +++ b/ALCF/notes/checkpoints.md @@ -0,0 +1,207 @@ +# Converting Checkpoints + +## Megatron $\rightarrow$ 🤗 HuggingFace + +On Aurora, + +- Setup: + + ```bash + CKPT_ROOT="/flare/Aurora_deployment/AuroraGPT-Testing/foremans/rollback-41k8/Megatron-DeepSpeed-41800/checkpoints/ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb3072_sp1_pp1_tp1_bf16_optadamw_lr0.00020_lwf0.05"; + + LAST_STEP=$(cat "${CKPT_ROOT}/latest_checkpointed_iteration.txt") + GLOBAL_STEP="${GLOBAL_STEP:-${LAST_STEP}}" + + SRC="${CKPT_ROOT}/global_step${GLOBAL_STEP}" + + OUTPUT_PARENT="/flare/Aurora_deployment/AuroraGPT-Checkpoints/production-checkpoints/aGPT-7B/HF" + DST="${OUTPUT_PARENT}/global_step${GLOBAL_STEP}_hf" + + printf "SRC: %s\n DST: %s\n" "${SRC}" "${DST}" + ``` + +- Convert: + + ```bash + python3 Megatron-DeepSpeed/mds_to_hf.py \ + --mds_checkpoint "${SRC}/mp_rank_00_model_states.pt" \ + --output_dir "${DST}" \ + --cache_dir "./.cache" + ``` + + + + + + + + + + [DST] --> + + + + + + + + + + + +## Use in 🤗 `transformers` + +```python +from pathlib import Path +import time +from rich import 
print +from typing import Optional +from transformers import LlamaForCausalLM, AutoTokenizer + +def load_model(ckpt_dir: str, step: Optional[int] = None): + if step is None: + fp = Path(ckpt_dir) + else: + fp = Path(ckpt_dir).joinpath(f"global_step{step}_hf") + print(f"Loading ckpt from: {fp}") + if fp.exists(): + model = LlamaForCausalLM.from_pretrained(fp.as_posix()) + print(f"{model=}") + return model + + raise FileNotFoundError(f"Unable to locate checkpoint at: {fp}") + + +def eval_model( + model: torch.nn.Module, + max_length: int = 64, + prompt: Optional[str] = None, + tokenizer: Optional[AutoTokenizer] = None, +) -> str: + prompt = "What is it like in there?" if prompt is None else prompt + tokenizer = ( + AutoTokenizer.from_pretrained("meta-llama/Llama-2-7B-hf") + if tokenizer is None else tokenizer + ) + output = ( + tokenizer.batch_decode( + model.generate( + **tokenizer(prompt, return_tensors="pt"), + max_length=max_length, + ), + clean_up_tokenization_spaces=True, + skip_special_tokens=True, + )[0] + ) + return output + + +def loop_over_checkpoints( + steps_list: list[int], + ckpt_dir: str, + max_length: int = 128, + prompt: Optional[str] = None, +): + for step in steps_list: + t0 = time.perf_counter() + prompt = "What is it like in there?" if prompt is None else prompt + print(f"\n Loading model from checkpoint at global step: {step}") + outputs = eval_model( + load_model(step, ckpt_dir), + max_length=max_length, + prompt=prompt, + ) + print(f"{outputs}") + print(f"\ntook: {time.perf_counter() - t0:.6f}s\n") +``` + +```python +>>> ckpt_dir = "/flare/Aurora_deployment/AuroraGPT-Checkpoints/production-checkpoints/aGPT-7B/HF/" +>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7B-hf") +>>> model = load_model(76300, ckpt_dir) +Loading ckpt from: +/flare/Aurora_deployment/AuroraGPT-Checkpoints/production-checkpoints/aGPT-7B/HF/global_step76300_hf +model=LlamaForCausalLM( + (model): LlamaModel( + (embed_tokens): Embedding(32000, 4096) + (layers): ModuleList( + (0-31): 32 x LlamaDecoderLayer( + (self_attn): LlamaSdpaAttention( + (q_proj): Linear(in_features=4096, out_features=4096, bias=False) + (k_proj): Linear(in_features=4096, out_features=1024, bias=False) + (v_proj): Linear(in_features=4096, out_features=1024, bias=False) + (o_proj): Linear(in_features=4096, out_features=4096, bias=False) + (rotary_emb): LlamaRotaryEmbedding() + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=4096, out_features=11008, bias=False) + (up_proj): Linear(in_features=4096, out_features=11008, bias=False) + (down_proj): Linear(in_features=11008, out_features=4096, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05) + ) + ) + (norm): LlamaRMSNorm((4096,), eps=1e-05) + (rotary_emb): LlamaRotaryEmbedding() + ) + (lm_head): Linear(in_features=4096, out_features=32000, bias=False) +) + +>>> print( +... eval_model( +... model, +... max_length=128, +... prompt="What is it like in there?", +... tokenizer=tokenizer +... ) +... ) +Setting `pad_token_id` to `eos_token_id`:None for open-end generation. +Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, whereit will always be FP32) +What is it like in there? +I've been in there a few times. It's a pretty cool place. +I've been in there a few times. It's a pretty cool place. +I've been in there a few times. It's a pretty cool place. +I've been in there a few times. 
It's a pretty cool place. +I've been in there a few times. It's a pretty cool place. +I've been in there a few times. It's a pretty cool place. +I've been in +``` + +## Helper Script + +```bash +convert_mds_to_hf() { + if [[ "$#" -eq 3 ]]; then + GLOBAL_STEP=$1 + CKPT_ROOT=$2 + OUTPUT_PARENT=$3 + elif [[ "$#" -eq 2 ]]; then + GLOBAL_STEP=$1 + CKPT_ROOT=$2 + OUPUT_PARENT=$(pwd) + elif [[ "$#" -eq 1 ]]; then + GLOBAL_STEP=$1 + CKPT_ROOT="/flare/Aurora_deployment/AuroraGPT-Testing/foremans/rollback-41k8/Megatron-DeepSpeed-41800/checkpoints/ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb3072_sp1_pp1_tp1_bf16_optadamw_lr0.00020_lwf0.05/"; + OUPUT_PARENT=$(pwd) + else + echo "Expected exactly 1, 2, or 3 arguments (global_step, src, dst, respectively)" + exit + fi + SRC="${CKPT_ROOT}/global_step${GLOBAL_STEP}" + DST="${OUTPUT_PARENT}/global_step${GLOBAL_STEP}_hf" + if [[ -d "${SRC}" ]]; then + echo "Converting checkpoint @ global step ${GLOBAL_STEP}" + echo "\tsrc = ${SRC}\n" + echo "\tdst = ${DST}\n" + python3 mds_to_hf.py \ + --mds_checkpoint "${SRC}/mp_rank_00_model_states.pt" \ + --output_dir "${DST}" \ + --cache_dir "./.cache" + else + echo "Unable to locate directory ${SRC}. Exiting" + exit 1 + fi +} +``` diff --git a/ALCF/notes/deepspeed_init_time.md b/ALCF/notes/deepspeed_init_time.md new file mode 100644 index 0000000000..a355a082a9 --- /dev/null +++ b/ALCF/notes/deepspeed_init_time.md @@ -0,0 +1,269 @@ +# DeepSpeed Initialization Time on Aurora + +## TODOs + +- [ ] Use `ZeRO={1, 2}` @ 256 Nodes of Aurora +- [ ] Figure out bottleneck in startup time on Aurora +- [ ] Use GAS=8 on Aurora +- [ ] Weight decay too high +- [ ] Save checkpoints every ~ 1 hr +- [ ] Write weekly updates and post to GitHub + +## Initialization Times + +- Search for "deepspeed.initialize" in `Megatron-DeepSpeed/logs/`: + +```bash +#[🌌][11:44:57 PM][foremans@aurora-uan-0010][…/Megatron-DeepSpeed/logs][🌱 alcf-startup-time][$!?] +$ rg --hidden "deepspeed\.initialize" **/**/*.log | grep took +``` + +### Measurements + +| NUM_NODES | WORLD_SIZE | TIME | +|:---------:|:----------:|:----------:| +| 8 | 96 | 61.073 | +| | | | +| 16 | 192 | 107.74411 | +| 16 | 192 | 107.201338 | +| 16 | 192 | 107.10853 | +| | | | +| 32 | 384 | 200.23095 | +| 32 | 384 | 206.49485 | +| 32 | 384 | 200.49485 | +| | | | +| 64 | 768 | 413.55765 | +| 64 | 768 | 394.92617 | +| 64 | 768 | 414.725 | +| 64 | 768 | 387.987 | +| 64 | 768 | 411.72035 | +| 64 | 768 | 394.926 | +| 64 | 768 | 409.375 | +| 64 | 768 | 393.091 | +| 64 | 768 | 412.600 | +| | | | +| 128 | 1536 | 789.30077 | +| 128 | 1536 | 788.86531 | +| 128 | 1536 | 792.71864 | +| 128 | 1536 | 836.98 | +| 128 | 1536 | 801.205 | +| 128 | 1536 | 836.98 | +| 128 | 1536 | 820.9538 | +| 128 | 1536 | 707.048 | +| | | | +| 256 | 3072 | 1639.62374 | +| 256 | 3072 | 1591.345 | +| 256 | 3072 | 1632.12712 | +| 256 | 3072 | 1674.444 | +| 256 | 3072 | 1618.100 | + + +-
WORLD_SIZE=96: + + ```bash title="deepspeed_init_times.sh" + ws96_ds_stage1_nl32_hs4096_mb4_seq4096_gb3072_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-125717_96_x4420c5s5b0n0.hostmgmt2420.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:59:19][INFO][training:795] - 'deepspeed.initialize' took: 61.07362s + ws96_ds_stage1_nl32_hs4096_mb4_seq4096_gb3072_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-125717_96_x4420c5s5b0n0.hostmgmt2420.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:59:19][INFO][training:795] - 'deepspeed.initialize' took: 61.07362s + ws96_ds_stage1_nl32_hs4096_mb4_seq4096_gb3072_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-125717_96_x4420c5s5b0n0.hostmgmt2420.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:59:19][INFO][training:795] - 'deepspeed.initialize' took: 61.07362s + ``` + +
+ +-
WORLD_SIZE = 192: + + ```bash + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb6144_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-154948_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:52:30][INFO][training:795] - 'deepspeed.initialize' took: 107.74411s + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb6144_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-154948_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:52:30][INFO][training:795] - 'deepspeed.initialize' took: 107.74411s + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb6144_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-154948_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:52:30][INFO][training:795] - 'deepspeed.initialize' took: 107.74411s + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamwschedulefree_lr0.0003_lwf0.05/20240623-163640_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:38:52][INFO][training:800] - 'deepspeed.initialize' took: 107.10853s + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb6144_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-160332_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:05:43][INFO][training:800] - 'deepspeed.initialize' took: 107.20138s + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb6144_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-160332_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:05:43][INFO][training:800] - 'deepspeed.initialize' took: 107.20138s + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb6144_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-160332_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:05:43][INFO][training:800] - 'deepspeed.initialize' took: 107.20138s + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamwschedulefree_lr0.0003_lwf0.05/20240623-163640_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:38:52][INFO][training:800] - 'deepspeed.initialize' took: 107.10853s + ws192_ds_stage1_nl32_hs4096_mb4_seq4096_gb768_sp1_pp1_tp1_bf16_optadamwschedulefree_lr0.0003_lwf0.05/20240623-163640_192_x4716c2s6b0n0.hostmgmt2716.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:38:52][INFO][training:800] - 'deepspeed.initialize' took: 107.10853s + ``` + +
+ +-
WORLD_SIZE = 384: + + ```bash + ws384_ds_stage1_nl32_hs4096_mb4_seq4096_gb12288_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-164607_384_x4402c6s7b0n0.hostmgmt2402.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:52:15][INFO][training:800] - 'deepspeed.initialize' took: 206.49485s + ws384_ds_stage1_nl32_hs4096_mb4_seq4096_gb12288_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-164607_384_x4402c6s7b0n0.hostmgmt2402.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:52:15][INFO][training:800] - 'deepspeed.initialize' took: 206.49485s + ws384_ds_stage1_nl32_hs4096_mb4_seq4096_gb12288_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-164607_384_x4402c6s7b0n0.hostmgmt2402.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:52:15][INFO][training:800] - 'deepspeed.initialize' took: 206.49485s + ws384_ds_stage1_nl32_hs4096_mb4_seq4096_gb12288_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-223159_384_x4706c1s6b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 22:37:53][INFO][training:800] - 'deepspeed.initialize' took: 200.23095s + ws384_ds_stage1_nl32_hs4096_mb4_seq4096_gb12288_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-223159_384_x4706c1s6b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 22:37:53][INFO][training:800] - 'deepspeed.initialize' took: 200.23095s + ws384_ds_stage1_nl32_hs4096_mb4_seq4096_gb12288_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-223159_384_x4706c1s6b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 22:37:53][INFO][training:800] - 'deepspeed.initialize' took: 200.23095s + ``` + +
+ +-
WORLD_SIZE=768: + + ```bash + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-180052_768_x4704c4s1b0n0.hostmgmt2704.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 18:12:43][INFO][training:800] - 'deepspeed.initialize' took: 394.92617s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-185626_768_x4415c2s3b0n0.hostmgmt2415.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 19:05:45][INFO][training:800] - 'deepspeed.initialize' took: 414.72580s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-185626_768_x4415c2s3b0n0.hostmgmt2415.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 19:05:45][INFO][training:800] - 'deepspeed.initialize' took: 414.72580s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-233045_768_x4711c0s1b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 23:39:19][INFO][training:797] - 'deepspeed.initialize' took: 387.98744s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-233045_768_x4711c0s1b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 23:39:19][INFO][training:797] - 'deepspeed.initialize' took: 387.98744s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-233045_768_x4711c0s1b0n0.hostmgmt2711.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 23:39:19][INFO][training:797] - 'deepspeed.initialize' took: 387.98744s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-141802_768_x4706c2s0b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 14:27:50][INFO][training:795] - 'deepspeed.initialize' took: 411.72035s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-141802_768_x4706c2s0b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 14:27:50][INFO][training:795] - 'deepspeed.initialize' took: 411.72035s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-180052_768_x4704c4s1b0n0.hostmgmt2704.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 18:12:43][INFO][training:800] - 'deepspeed.initialize' took: 394.92617s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-180052_768_x4704c4s1b0n0.hostmgmt2704.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 18:12:43][INFO][training:800] - 'deepspeed.initialize' took: 394.92617s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-134324_768_x4705c2s1b0n0.hostmgmt2705.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:51:19][INFO][training:795] - 'deepspeed.initialize' took: 393.09134s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-185626_768_x4415c2s3b0n0.hostmgmt2415.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 19:05:45][INFO][training:800] - 'deepspeed.initialize' took: 414.72580s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-165713_768_x4706c2s3b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:06:47][INFO][training:800] - 'deepspeed.initialize' took: 389.15768s + 
ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-122601_768_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:35:18][INFO][training:793] - 'deepspeed.initialize' took: 409.37578s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-134324_768_x4705c2s1b0n0.hostmgmt2705.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:51:19][INFO][training:795] - 'deepspeed.initialize' took: 393.09134s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-134324_768_x4705c2s1b0n0.hostmgmt2705.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:51:19][INFO][training:795] - 'deepspeed.initialize' took: 393.09134s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-141802_768_x4706c2s0b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 14:27:50][INFO][training:795] - 'deepspeed.initialize' took: 411.72035s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-165713_768_x4706c2s3b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:06:47][INFO][training:800] - 'deepspeed.initialize' took: 389.15768s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-124517_768_x4315c4s1b0n0.hostmgmt2315.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:57:42][INFO][training:795] - 'deepspeed.initialize' took: 395.05079s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-124517_768_x4315c4s1b0n0.hostmgmt2315.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:57:42][INFO][training:795] - 'deepspeed.initialize' took: 395.05079s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-165713_768_x4706c2s3b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:06:47][INFO][training:800] - 'deepspeed.initialize' took: 389.15768s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-130702_768_x4420c6s7b0n0.hostmgmt2420.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:15:22][INFO][training:795] - 'deepspeed.initialize' took: 412.60004s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-130702_768_x4420c6s7b0n0.hostmgmt2420.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:15:22][INFO][training:795] - 'deepspeed.initialize' took: 412.60004s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-122601_768_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:35:18][INFO][training:793] - 'deepspeed.initialize took: 409.37578s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-122601_768_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:35:18][INFO][training:793] - 'deepspeed.initialize took: 409.37578s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-173730_768_x4707c5s6b0n0.hostmgmt2707.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:45:33][INFO][training:800] - 'deepspeed.initialize' took: 400.74402s + 
ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-124517_768_x4315c4s1b0n0.hostmgmt2315.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:57:42][INFO][training:795] - 'deepspeed.initialize' took: 395.05079s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-130702_768_x4420c6s7b0n0.hostmgmt2420.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:15:22][INFO][training:795] - 'deepspeed.initialize' took: 412.60004s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-132452_768_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:34:32][INFO][training:795] - 'deepspeed.initialize' took: 413.55765s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-173730_768_x4707c5s6b0n0.hostmgmt2707.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:45:33][INFO][training:800] - 'deepspeed.initialize' took: 400.74402s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-173730_768_x4707c5s6b0n0.hostmgmt2707.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:45:33][INFO][training:800] - 'deepspeed.initialize' took: 400.74402s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-132452_768_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:34:32][INFO][training:795] - 'deepspeed.initialize' took: 413.55765s + ws768_ds_stage1_nl32_hs4096_mb4_seq4096_gb24576_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-132452_768_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:34:32][INFO][training:795] - 'deepspeed.initialize' took: 413.55765s + ``` + +
+ +-
WORLD_SIZE = 1536: + + ```bash + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-162028_1536_x4706c2s3b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:37:49][INFO][training:800] - 'deepspeed.initialize' took: 789.30077s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-162028_1536_x4706c2s3b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:37:49][INFO][training:800] - 'deepspeed.initialize' took: 789.30077s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-162028_1536_x4706c2s3b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:37:49][INFO][training:800] - 'deepspeed.initialize' took: 789.30077s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-145656_1536_x4119c5s7b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:14:35][INFO][training:795] - 'deepspeed.initialize' took: 788.86531s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-145656_1536_x4119c5s7b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:14:35][INFO][training:795] - 'deepspeed.initialize' took: 788.86531s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-145656_1536_x4119c5s7b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:14:35][INFO][training:795] - 'deepspeed.initialize' took: 788.86531s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-122207_1536_x4309c6s4b0n0.hostmgmt2309.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:39:56][INFO][training:793] - 'deepspeed.initialize' took: 792.71864s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-122207_1536_x4309c6s4b0n0.hostmgmt2309.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:39:56][INFO][training:793] - 'deepspeed.initialize' took: 792.71864s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-122207_1536_x4309c6s4b0n0.hostmgmt2309.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 12:39:56][INFO][training:793] - 'deepspeed.initialize' took: 792.71864s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-125001_1536_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:06:47][INFO][training:795] - 'deepspeed.initialize' took: 836.98388s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-125001_1536_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:06:47][INFO][training:795] - 'deepspeed.initialize' took: 836.98388s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-175213_1536_x4702c1s4b0n0.hostmgmt2702.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 18:10:54][INFO][training:800] - 'deepspeed.initialize' took: 801.20500s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-175213_1536_x4702c1s4b0n0.hostmgmt2702.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 18:10:54][INFO][training:800] - 'deepspeed.initialize' took: 801.20500s + 
ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-184503_1536_x4702c1s4b0n0.hostmgmt2702.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 19:04:07][INFO][training:800] - 'deepspeed.initialize' took: 801.15950s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-131641_1536_x4315c4s1b0n0.hostmgmt2315.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:33:00][INFO][training:795] - 'deepspeed.initialize' took: 801.11322s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-213107_1536_x4415c2s3b0n0.hostmgmt2415.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 21:46:29][INFO][training:800] - 'deepspeed.initialize' took: 820.95380s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-155216_1536_x4706c2s3b0n0.hostmgmt2706.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 16:07:23][INFO][training:795] - 'deepspeed.initialize' took: 787.04806s + ws1536_ds_stage1_nl32_hs4096_mb4_seq4096_gb49152_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-141727_1536_x4102c7s0b0n0.hostmgmt2102.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 14:34:20][INFO][training:795] - 'deepspeed.initialize' took: 809.36787s + ``` + +
+ +-
WORLD_SIZE = 3072: + + ```bash + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-144534_3072_x4309c6s2b0n0.hostmgmt2309.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:18:41][INFO][training:795] - 'deepspeed.initialize' took: 1639.62374s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-144534_3072_x4309c6s2b0n0.hostmgmt2309.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:18:41][INFO][training:795] - 'deepspeed.initialize' took: 1639.62374s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-144534_3072_x4309c6s2b0n0.hostmgmt2309.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 15:18:41][INFO][training:795] - 'deepspeed.initialize' took: 1639.62374s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-213304_3072_x4704c0s6b0n0.hostmgmt2704.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 22:03:15][INFO][training:800] - 'deepspeed.initialize' took: 1591.34487s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-213304_3072_x4704c0s6b0n0.hostmgmt2704.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 22:03:15][INFO][training:800] - 'deepspeed.initialize' took: 1591.34487s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-213304_3072_x4704c0s6b0n0.hostmgmt2704.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 22:03:15][INFO][training:800] - 'deepspeed.initialize' took: 1591.34487s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-170636_3072_x4415c2s3b0n0.hostmgmt2415.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:37:20][INFO][training:800] - 'deepspeed.initialize' took: 1632.12712s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-170636_3072_x4415c2s3b0n0.hostmgmt2415.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:37:20][INFO][training:800] - 'deepspeed.initialize' took: 1632.12712s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-170636_3072_x4415c2s3b0n0.hostmgmt2415.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 17:37:20][INFO][training:800] - 'deepspeed.initialize' took: 1632.12712s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-124519_3072_x4119c5s3b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:16:22][INFO][training:795] - 'deepspeed.initialize' took: 1674.44393s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-124519_3072_x4119c5s3b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:16:22][INFO][training:795] - 'deepspeed.initialize' took: 1674.44393s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-124519_3072_x4119c5s3b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 13:16:22][INFO][training:795] - 'deepspeed.initialize' took: 1674.44393s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-140113_3072_x4119c5s3b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 14:30:23][INFO][training:795] - 'deepspeed.initialize' took: 1618.10035s + 
ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-140113_3072_x4119c5s3b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 14:30:23][INFO][training:795] - 'deepspeed.initialize' took: 1618.10035s + ws3072_ds_stage1_nl32_hs4096_mb4_seq4096_gb98304_sp1_pp1_tp1_bf16_optadamw_lr0.0003_lwf0.05/20240623-140113_3072_x4119c5s3b0n0.hostmgmt2119.cm.aurora.alcf.anl.gov/output.log: + [2024-06-23 14:30:23][INFO][training:795] - 'deepspeed.initialize' took: 1618.10035s + ``` + +
diff --git a/ALCF/pre-AuroraGPT/README.md b/ALCF/pre-AuroraGPT/README.md new file mode 100644 index 0000000000..e8f4127876 --- /dev/null +++ b/ALCF/pre-AuroraGPT/README.md @@ -0,0 +1,96 @@ +# Megatron-DeepSpeed (@ [ALCF](https://alcf.anl.gov)) + +![image](https://github.com/argonne-lcf/Megatron-DeepSpeed/assets/5234251/f06df155-30e8-4894-a4c2-c17ff4b34ada) + +We describe below the instructions for launching distributed training with +Microsoft's Megatron-DeepSpeed and briefly describe some parallelism +strategies and various optimizations that are supported. + +> [!IMPORTANT] +> We maintain this (forked) version at +> [`argonne-lcf/Megatron-DeepSpeed`](https://github.com/argonne-lcf/Megatron-DeepSpeed) +> that has some [helper scripts](#helper-scripts) for launching and setting +> various training options. +> +> These changes are entirely self-contained **HERE** in [`ALCF/`](.) + +## Setup + +1. Load `conda` and activate base environment: + + ```bash + # load conda + activate base env + module load conda/2023-10-04 ; conda activate base + ``` + +1. Clone + [`argonne-lcf/Megatron-DeepSpeed`](https://github.com/argonne-lcf/Megatron-DeepSpeed) + and navigate into it: + + ```bash + # clone + navigate into Megatron-DeepSpeed repo + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed + cd Megatron-DeepSpeed + ``` + +1. Make virtual environment (on top of base conda): + + ```bash + # make virtual environment (on top of base conda) + mkdir -p venvs/polaris/2023-10-04 + python3 -m venv venvs/polaris/2023-10-04 --system-site-packages + source venvs/polaris/2023-10-04/bin/activate + ``` + +1. Install missing dependency: + + ```bash + # install *missing dependency + python3 -m pip install "git+https://github.com/saforem2/ezpz" + ``` + +1. Launch training: + + ```bash + # ---- launch training ----------------------- + # - MODEL_SIZE_KEY: defined in ALCF/model.sh + # - other args: defined in ALCF/args.sh + # --------------------------------------------- + MODEL_SIZE_KEY="GPT25B" \ + SEQ_LEN=4096 \ + USE_FLASH_ATTN_V2=1 \ + MICRO_BATCH=1 \ + GAS=1 \ + SP_TYPE="megatron" \ + ZERO_STAGE=1 \ + ./ALCF/train-gpt3.sh + ``` + + +## Helper Scripts + +- [`pretrain_gpt_alcf.py`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/pretrain_gpt_alcf.py) +- 📂 [`ALCF/`](https://github.com/argonne-lcf/Megatron-DeepSpeed/tree/main/ALCF) + `├──` [`args.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/models.sh) + `├──` [`launch.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/launch.sh) + `├──` [`model.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/model.sh) + `├──` [`setup.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/setup.sh) + `├──` [`submit-pbs.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/submit-pbs.sh) + `├──` [`submit.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/submit.sh) + `└──` [`train-gpt3.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/train-gpt3.sh) + + +
+
pretrain_gpt_alcf.py +
Python module to be launched. Running `./ALCF/train-gpt3.sh` will automatically build an `mpiexec` command and launch this module.
+
ALCF/train-gpt3.sh +
Main entry point for training. This script will automatically source the rest of the required `ALCF/*.sh` scripts listed below.
+
ALCF/model.sh
+
Contains some example model architectures for GPT3-style models.
+
ALCF/args.sh
+
Logic for parsing / setting up runtime options for Megatron and DeepSpeed.
+
ALCF/setup.sh
+
Locate and activate the virtual environment to be used, and ensure that MPI variables are set properly.
+
ALCF/launch.sh
+
Identify available resources and build the command to be run, i.e. figure out how many `{nodes, GPUs per node, GPUs total}` are available to pass to `mpi{run,exec}`, then use these to build `mpiexec {mpiexec-args} python3 pretrain_gpt.py` (a minimal sketch of this logic follows this list).
+
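As a rough illustration of the descriptions above, the sketch below mirrors the host/GPU counting performed in `ALCF/launch.sh` and then assembles a schematic launch command. The `mpiexec` flags shown are illustrative only (the real flags come from `MPI_COMMAND` / `MPI_DEFAULTS` in `ALCF/setup.sh` and are machine-specific), and `${ARGS}` stands in for the Megatron / DeepSpeed arguments built by `ALCF/args.sh`:

```bash
#!/bin/bash
# Sketch only: count hosts / GPUs the same way ALCF/launch.sh does,
# then assemble a schematic mpiexec command analogous to the one it builds.
HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}"     # node list provided by PBS
NHOSTS=$(wc -l < "${HOSTFILE}")             # number of nodes in the job
NGPU_PER_HOST=$(nvidia-smi -L | wc -l)      # GPUs visible on this node
NGPUS=$(( NHOSTS * NGPU_PER_HOST ))         # total GPUs = nodes x GPUs/node
echo "Running on ${NHOSTS} hosts with ${NGPU_PER_HOST} GPUs each (${NGPUS} GPUs total)"

# Illustrative launch; the actual MPI flags are supplied by ALCF/setup.sh,
# and ${ARGS} is assembled by ALCF/args.sh.
mpiexec -n "${NGPUS}" --ppn "${NGPU_PER_HOST}" \
    python3 pretrain_gpt_alcf.py ${ARGS}
```

In practice none of this is run by hand: `./ALCF/train-gpt3.sh` sources the scripts above and performs these steps automatically; the sketch is only meant to make the moving pieces explicit.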
diff --git a/ALCF/pre-AuroraGPT/args.sh b/ALCF/pre-AuroraGPT/args.sh new file mode 100755 index 0000000000..17e0018110 --- /dev/null +++ b/ALCF/pre-AuroraGPT/args.sh @@ -0,0 +1,533 @@ +#!/bin/bash --login + +function FindMegatron() { + MEGATRON_INSTALL=$(python3 -c 'import megatron; print(megatron.__file__)' | tail -1) + MEGATRON_DIR=$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1))) +} + +function WhereAmI() { + python3 -c 'import os; print(os.getcwd())' +} + +function join_by { local d=${1-} f=${2-}; if shift 2; then printf %s "$f" "${@/#/$d}"; fi; } + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} + + +USER=$(whoami) +HERE=$(WhereAmI) +ALCF_DIR="${HERE}/ALCF" +PARENT=$(dirname "${ALCF_DIR}") +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "PARENT: ${PARENT}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + + +HOSTNAME=$(hostname) +sourceFile "${ALCF_DIR}/setup.sh" + +WORLD_SIZE="${NGPUS}" +PARALLEL_SIZE="${WORLD_SIZE}" +echo "NHOSTS * (NGPU / HOST) = $NHOSTS * $NGPU_PER_HOST = $NGPUS" + +export MODEL_SIZE_KEY="${MODEL_SIZE_KEY:-GPT13B}" +echo "==========================+" +echo "Using ${MODEL_SIZE_KEY}" +echo "==========================+" + +sourceFile "${ALCF_DIR}/model.sh" + +MODEL_TYPE=${MODEL_TYPE:-gpt} + +# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ Model Parallel / Pipeline Parallel ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +export DDP_IMPL="local" # FSDP | local | torch +export USE_ACTIVATION_CHECKPOINTING=1 # 1 | 0 +export SEQ_LEN=${SEQ_LEN:-2048} +export PPSIZE=${PPSIZE:-1} +export MICRO_BATCH=${MICRO_BATCH:-1} +export GRADIENT_ACCUMULATION_STEPS=${GAS:-1} +export MODEL_TYPE=${MODEL_TYPE:-"gpt"} # set bert or gpt +export SP_TYPE=${SP_TYPE:-"megatron"} # set ds or megatron +export ZERO_STAGE=${ZERO_STAGE:-1} +export MPSIZE=${MPSIZE:-${WORLD_SIZE:-1}} +export SPSIZE=${SPSIZE:-1} + +# +# Deal with Sequence Parallel implementation --------------------------------------- +# ---------------------------------------------------------------------------------- +if [[ ${SP_TYPE} == "ds" ]]; then + # NOTE: -------------------------------------------------------------------- + # SP_TYPE="ds" has NO effect, essentially running with no Seq. parallelism + # -------------------------------------------------------------------------- + if [[ "$MPSIZE" == "${WORLD_SIZE}" ]]; then + # hacky workaround to try and use SP_TYPE="ds" + MPSIZE="${WORLD_SIZE}" + # ------------------------------------------------------------------------ + # Update [2023-08-22]: Chengming mentioned that this is an internal issue + # and will NOT work currently + # ------------------------------------------------------------------------ + echo "Caught MPSIZE: $MPSIZE from env. Setting SPSIZE=1" + SPSIZE=1 + MPSIZE="${MPSIZE}" + else + echo "Didn't catch MPSIZE from env. 
Setting SPSIZE=${WORLD_SIZE}, MPSIZE=1" + MPSIZE=1 + SPSIZE="${WORLD_SIZE}" + fi + if [ -z "${ZERO_STAGE}" ]; then + echo "ZERO_STAGE not set, setting to 3 for ${SP_TYPE}" + ZERO_STAGE=3 + else + echo "Caught ZERO_STAGE=${ZERO_STAGE} with ${SP_TYPE}" + fi + export SPSIZE="${SPSIZE:-$WORLD_SIZE}" + export MPSIZE="${MPSIZE:-1}" + export USE_SEQUENCE_PARALLEL=0 + export ZERO_STAGE="${ZERO_STAGE}" +elif [[ ${SP_TYPE} == "megatron" ]]; then + # NOTE: -------------------------------------------------------------------------- + # SP_TYPE="megatron" will use Megatron's Seq. || implementation with ZERO_STAGE=0 + # -------------------------------------------------------------------------------- + [ "$SPSIZE" ] && echo "Caught SPSIZE: ${SPSIZE} from env" || SPSIZE=1 + [ "$MPSIZE" ] && echo "Caught MPSIZE: ${MPSIZE} from env" || MPSIZE="${WORLD_SIZE}" + [ "$ZERO_STAGE" ] && echo "Caught ${ZERO_STAGE} from env" || ZERO_STAGE=0 + [ "$USE_SEQUENCE_PARALLEL" ] && echo "Caught USE_SP: $USE_SEQUENCE_PARALLEL from env" || USE_SEQUENCE_PARALLEL=1 + if [[ ${PPSIZE} > 1 ]]; then # && ${MPSIZE}==${WORLD_SIZE} ]]; + MPSIZE=$(( WORLD_SIZE / PPSIZE )) + echo "Re-setting MPSIZE to ${WORLD_SIZE} / ${PPSIZE} = $(( WORLD_SIZE / PPSIZE ))" + echo "MPSIZE: $MPSIZE" + # MPSIZE="${WORLD_SIZE}/" + fi + export SPSIZE="${SPSIZE}" + export MPSIZE="${MPSIZE}" + export ZERO_STAGE="${ZERO_STAGE}" + export USE_SEQUENCE_PARALLEL="${USE_SEQUENCE_PARALLEL:-1}" +else + echo "Unexpected SP_TYPE: ${SP_TYPE}" + # exit 1 +fi +# ------------------------------------------------------------------------ +# +echo "####################################################" +echo "USING: ${SP_TYPE}" +echo "SPSIZE: ${SPSIZE}" +echo "PPSIZE: ${SPSIZE}" +echo "MPSIZE: ${MPSIZE}" +echo "ZERO_STAGE: ${ZERO_STAGE}" +echo "WORLD_SIZE: ${WORLD_SIZE}" +echo "USE_SEQUENCE_PARALLEL: ${USE_SEQUENCE_PARALLEL}" +echo "####################################################" + +echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++" +echo "${SP_TYPE} sequence parallelism, with: " +echo " {MPSIZE: ${MPSIZE}, SPSIZE: ${SPSIZE}, USE_SEQUENCE_PARALLEL: ${USE_SEQUENCE_PARALLEL}} !!" 
+echo "########################################################" + +GLOBAL_BATCH=$(( NGPUS * MICRO_BATCH * GRADIENT_ACCUMULATION_STEPS )) +echo "GB = NGPUS * MB * GAS = ${NGPUS} * ${MICRO_BATCH} * ${GRADIENT_ACCUMULATION_STEPS} = ${GLOBAL_BATCH}" + +GLOBAL_BATCH=$(( GLOBAL_BATCH / MPSIZE / PPSIZE / SPSIZE)) +echo "GB = (NGPUS * MB * GAS) / (MP * PP * SP) = (${NGPUS} * ${MICRO_BATCH} * ${GRADIENT_ACCUMULATION_STEPS}) / (${MPSIZE} * ${PPSIZE} * ${SPSIZE}) = ${GLOBAL_BATCH}" + +if [[ "${GLOBAL_BATCH}" == 0 ]]; then + GLOBAL_BATCH=1 +fi +# [ "${GLOBAL_BATCH:-${GLOBAL_BATCH}}" == 0 ] && GLOBAL_BATCH=1 || echo "GLOBAL_BATCH: ${GLOBAL_BATCH}" +export GLOBAL_BATCH="$GLOBAL_BATCH" + +DPSIZE=$(( $WORLD_SIZE / $PPSIZE / $MPSIZE )) + +# echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" +# echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + +echo "--------------------------------" +echo "GLOBAL_BATCH=${GLOBAL_BATCH}" +echo "USING DPSIZE: ${DPSIZE}" +echo "--------------------------------" + +# ┏━━━━━━━━━━━━┓ +# ┃ Data paths ┃ +# ┗━━━━━━━━━━━━┛ +if [[ $(hostname) == nid* || $(hostname) == login* ]]; then + DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" + DATA_TYPE="BookCorpusDataset_text_document" +elif [[ $(hostname) == theta* || $(hostname) == x3* ]]; then + DATA_PARENT="/lus/eagle/projects/datasets/Megatron-DeepSpeed/GenSLMSubSample200k" # GenSLMSubSample200k" + # DATA_TYPE="GenSLMSubSample200k" + # DATA_PARENT="/home/foremans/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed" + # DATA_TYPE="books-0001_text_document" + # DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k" + DATA_TYPE="genslm_subsample_200k_sequence_document" +else + echo "Unable to determine DATA_PARENT for $(hostname)." + echo "Exiting!" 
+ exit 1 +fi + +DATA_DIR="${DATA_PARENT}/dataset" +DATA_PATH="${DATA_DIR}/${DATA_TYPE}" +VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" +MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" +# # +# [ "$(hostname)==login*" ] && DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed"_ +# [ "$(hostname)==nid*" ] && DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" +# # [ "$(hostname)==theta*" ] && DATA_PARENT="/lus/eagle/projects/datasets/BookCorpusDataset" +# [ "$(hostname)==theta*" ] && DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k" +# [ "$(hostname)==x3*" ] && DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k" +# # "/lus/grand/projects/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DS-Benchmarking" +# # /lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k/dataset/genslm_subsample_200k_sequence_document" +# # [ "$(hostname)==x3*" ] && DATA_PARENT="/lus/eagle/projects/datasets/BookCorpusDataset" +# # /lus/grand/projects/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DS-Benchmarking" +# # /lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k/dataset/genslm_subsample_200k_sequence_document.bin +# +# # DATA_PATH=/lus/grand/projects/datascience/vsastry/genslm_subsample_200k_sequence_document/genslm_subsample_200k_sequence_document +# DATA_DIR="${DATA_PARENT}/dataset" +# DATA_PATH="${DATA_DIR}/genslm_subsample_200k_sequence_document" +# VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" +# MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" + +# ┏━━━━━━━━━━━━━━━━━━━┓ +# ┃ FILE I/O SETTINGS ┃ +# ┗━━━━━━━━━━━━━━━━━━━┛ +RUN_STR="gb${GLOBAL_BATCH}_mb${MICRO_BATCH}" +RUN_STR="nl${NLAYERS}_hs${HIDDEN}_${RUN_STR}" +RUN_STR="mp${MPSIZE}_pp${PPSIZE}_sp${SPSIZE}_${RUN_STR}" +RUN_STR="z${ZERO_STAGE}_seqlen${SEQ_LEN}_${RUN_STR}" +RUN_STR="${MODEL_SIZE}_${RUN_STR}" + +# if [[ "${USE_FLASH_ATTN}" == 0 ]]; then +# echo "Not using Flash Attention!!" +# else +# +if [[ "${USE_FLASH_ATTN1}" || "${USE_FLASH_ATTN_V1}" ]]; then + # Flash Attention 1 + [ "${USE_FLASH_ATTN}" ] && RUN_STR="flashAttn_v1_${RUN_STR}" + [ "${USE_FLASH_ATTN1}" ] && RUN_STR="flashAttn_v1_${RUN_STR}" + [ "${USE_FLASH_ATTN_V1}" ] && RUN_STR="flashAttn_v1_${RUN_STR}" +elif [[ "${USE_FLASH_ATTN2}" || "${USE_FLASH_ATTN_V2}" ]]; then + # Flash Attention 2 + [ "${USE_FLASH_ATTN2}" ] && RUN_STR="flashAttn_v2_${RUN_STR}" + [ "${USE_FLASH_ATTN_V2}" ] && RUN_STR="flashAttn_v2_${RUN_STR}" +elif [[ "${USE_FLASH_ATTN_TRITON}" ]]; then + # Triton + Flash Attn + # Triton + Flash Attn + [ "${USE_FLASH_ATTN_TRITON}" ] && RUN_STR="flashAttn_triton_${RUN_STR}" +else + echo "Not using Flash Attention!" 
+fi + +if [[ $DDP_IMPL == 'FSDP' ]]; then + RUN_STR="FSDP_${RUN_STR}" +fi +if [[ $USE_ACTIVATION_CHECKPOINTING == 1 ]]; then + RUN_STR="actCkpt_${RUN_STR}" +fi +if [[ $USE_SEQUENCE_PARALLEL == 1 ]] ; then + RUN_STR="SP_${RUN_STR}" +fi + +RUN_STR="${MODEL_TYPE}_${RUN_STR}" + +OUTPUT_DIR="${PARENT}/outputs/${RUN_STR}" +CHECKPOINT_DIR="${PARENT}/checkpoints/$RUN_STR" +TENSORBOARD_DIR="${PARENT}/outputs/${RUN_STR}/tensorboard" + +DATE=$(date) +export DATE="${DATE}" +export RUN_STR="${RUN_STR}" +export MODEL_SIZE="$MODEL_SIZE" +export TENSORBOARD_DIR=$TENSORBOARD_DIR +export OUTPUT_DIR=$OUTPUT_DIR +mkdir -p "$OUTPUT_DIR/tensorboard/wandb" +mkdir -p "$CHECKPOINT_DIR" +mkdir -p "$TENSORBOARD_DIR" +mkdir -p "$OUTPUT_DIR" +echo "OUTPUT TO: ${OUTPUT_DIR}" + +# if [[ -z "${NVME_PATH}" ]]; then +# echo "NVME_PATH: $NVME_PATH" +# else +# if [[ $(hostname) == x* ]]; then +# export NVME_PATH="/local/scratch/" +# elif [[ $(hostname) == theta* ]]; then +# export NVME_PATH="/raid/scratch/" +# else +# export NVME_PATH="/tmp/" +# fi +# fi + +# echo "NVME_PATH: ${NVME_PATH}" + +if [[ $MODEL_TYPE == "gpt" ]] ; then + DATA_LOAD_ARGS="--data-path $DATA_PATH --vocab-file $VOCAB_FILE --merge-file $MERGE_FILE" +else + DATA_LOAD_ARGS="" +fi + +# Set to cpu for offloading to cpu for larger models +OFFLOAD_DEVICE="${OFFLOAD_DEVICE:-cpu}" +CPU_OPTIM=" --cpu-optimizer" + +# # Set to none and empty string for no cpu offloading +# OFFLOAD_DEVICE="none" +# CPU_OPTIM=" " + + +[ "${WANDB_MODE}" == "disabled" ] && WANDB_ENABLE="false" || WANDB_ENABLE="true" +echo "WANDB_ENABLE: ${WANDB_ENABLE}" + +# ┏━━━━━━━━━━━━━━━━━━┓ +# ┃ DeepSpeed Config ┃ +# ┗━━━━━━━━━━━━━━━━━━┛ +DS_CONFIG=${PARENT}/ds_config-gpt.json +echo "!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~!" +echo " DS_CONFIG: ${DS_CONFIG}" +echo "!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~!" 
+ +if [[ $ZERO_STAGE == "3" ]] ; then + cat < "$DS_CONFIG" + { + "train_micro_batch_size_per_gpu": $MICRO_BATCH, + "steps_per_print": 1, + "wall_clock_breakdown" : true, + "gradient_accumulation_steps": $GRADIENT_ACCUMULATION_STEPS, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 3, + "stage3_max_live_parameters": 3e9, + "stage3_max_reuse_distance": 3e9, + "stage3_param_persistence_threshold": 1e5, + "stage3_prefetch_bucket_size": 1e9, + "contiguous_gradients": true, + "overlap_comm": true, + "reduce_bucket_size": 90000000, + "sub_group_size": 5e7, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "offload_optimizer": { + "device": "cpu", + "buffer_count": 4, + "pipeline_read": false, + "pipeline_write": false, + "pin_memory": true +} +}, +"fp16": { +"enabled": true, +"initial_scale_power" : 12, +"loss_scale_window": 1000, +"hysteresis": 2, +"min_loss_scale": 1 +}, +"aio": { +"block_size": 1048576, +"queue_depth": 16, +"single_submit": false, +"overlap_events": true, +"thread_count": 2 +}, +"flops_profiler": { +"enabled": true, +"profile_step": 1, +"module_depth": -1, +"top_modules": 3, +"detailed": true, +"output_file": null +}, +"comms_logger": { +"enabled": true, +"verbose": false, +"prof_all": false, +"debug": false +}, +"wandb": { +"enabled": $WANDB_ENABLE, +"project": "GenSLM-Megatron-DS" +} +} +EOT +else + cat < "$DS_CONFIG" + { + "train_micro_batch_size_per_gpu": $MICRO_BATCH, + "gradient_accumulation_steps": $GRADIENT_ACCUMULATION_STEPS, + "steps_per_print": 1, + "wall_clock_breakdown" : true, + "zero_force_ds_cpu_optimizer": false, + "zero_optimization": { + "stage": $ZERO_STAGE, + "allgather_partitions": true, + "reduce_scatter": true, + "allgather_bucket_size": 5e8, + "overlap_comm": true, + "contiguous_gradients": true, + "offload_param": { + "device": "cpu", + "nvme_path": "/raid/scratch", + "pin_memory": false + }, + "offload_optimizer": { + "device": "cpu", + "nvme_path": "/raid/scratch/" +} +}, +"scheduler": { +"type": "WarmupLR", +"params": { +"warmup_min_lr": 0, +"warmup_max_lr": 0.001, +"warmup_num_steps": 1000 +} +}, +"fp16": { +"enabled": true, +"initial_scale_power": 12 +}, +"flops_profiler": { +"enabled": true, +"profile_step": 1, +"module_depth": -1, +"top_modules": 3, +"detailed": true, +"output_file": null +}, +"comms_logger": { +"enabled": true, +"verbose": false, +"prof_all": false, +"debug": false +}, +"wandb": { +"enabled": $WANDB_ENABLE, +"project": "GenSLM-Megatron-DS" +} +} +EOT +fi + +# ┏━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ DeepSpeed Arguments ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━┛ +if [[ "$DDP_IMPL" != "FSDP" ]] ; then + ds_args="" + ds_args=" --deepspeed ${ds_args}" + ds_args=" --deepspeed_mpi ${ds_args}" + ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" + ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" + if [[ "$PPSIZE" == 1 ]]; then + ds_args="--no-pipeline-parallel ${ds_args}" + else + ds_args=" --pipeline-model-parallel-size ${PPSIZE} ${ds_args}" + fi + if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then + ds_args=" --deepspeed-activation-checkpointing ${ds_args}" + fi +fi + +# ┏━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ MEGATRON-LM SETTINGS ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━┛ +gpt_args=( + "--no-async-tensor-model-parallel-allreduce" + "--seed ${RANDOM}" + "--DDP-impl ${DDP_IMPL}" + "--pipeline-model-parallel-size ${PPSIZE}" + "--tensor-model-parallel-size ${MPSIZE}" + "--ds-sequence-parallel-size ${SPSIZE}" + "--num-layers ${NLAYERS}" + "--hidden-size ${HIDDEN}" + "--num-attention-heads ${ATEN_HEADS}" + "--micro-batch-size ${MICRO_BATCH}" + 
"--global-batch-size ${GLOBAL_BATCH}" + "--seq-length ${SEQ_LEN}" + "--max-position-embeddings ${SEQ_LEN}" + "--train-iters 10" + "--lr-decay-iters 320000" + "--num-workers 0" + "$DATA_LOAD_ARGS" + "--data-impl mmap" + "--split 949,50,1" + "--distributed-backend nccl" + "--lr 0.00015" + "--lr-decay-style cosine" + "--min-lr 1.0e-5" + "--weight-decay 1e-2" + "--clip-grad 1.0" + "--lr-warmup-fraction .01" + "--log-interval 1" + "--save-interval 1000" + "--eval-interval 1000" + "--eval-iters 0" + "--override-opt_param-scheduler" + "--tensorboard-dir ${TENSORBOARD_DIR}" + "--log-timers-to-tensorboard" + "--tensorboard-log-interval 1" +) + + +if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then + gpt_args+=( + "--checkpoint-activations" + "--checkpoint-num-layers 1" +) +fi + +if [[ "$DDP_IMPL" != "FSDP" ]] ; then + gpt_args+=( + # "${gpt_args[*]}" + "--fp16" +) +else + gpt_args+=( + "--bf16" +) +fi + +# Flash Attention v1 +if [[ "${USE_FLASH_ATTN1}" || "${USE_FLASH_ATTN_V1}" ]]; then + [ "${USE_FLASH_ATTN}" ] && gpt_args+=("--use-flash-attn-v1") + [ "${USE_FLASH_ATTN1}" ] && gpt_args+=("--use-flash-attn-v1") + [ "${USE_FLASH_ATTN_V1}" ] && gpt_args+=("--use-flash-attn-v1") +# Flash Attention 2 +elif [[ "${USE_FLASH_ATTN2}" || "${USE_FLASH_ATTN_V2}" ]]; then + [ "${USE_FLASH_ATTN2}" ] && gpt_args+=("--use-flash-attn-v2") + [ "${USE_FLASH_ATTN_V2}" ] && gpt_args+=("--use-flash-attn-v2") +# Triton + Flash Attn +elif [[ "${USE_FLASH_ATTN_TRITON}" ]]; then + [ "${USE_FLASH_ATTN_TRITON}" ] && gpt_args+=("--use-flash-attn-triton") +fi + +if [[ "$USE_SEQUENCE_PARALLEL" == 1 ]]; then + export CUDA_DEVICE_MAX_CONNECTIONS=1 + gpt_args+=( + "--sequence-parallel" +) +fi + +if [[ "$ZERO_STAGE" > "0" ]] ; then + gpt_args+=( + "--cpu-optimizer" +) +fi + +export gpt_args=( +"${gpt_args[*]}" +"${ds_args[*]}" +) +ARGS="$(join_by ' ' ${gpt_args[*]})" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "ARGS: ${ARGS}" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" diff --git a/ALCF/pre-AuroraGPT/launch.sh b/ALCF/pre-AuroraGPT/launch.sh new file mode 100755 index 0000000000..2dd834f568 --- /dev/null +++ b/ALCF/pre-AuroraGPT/launch.sh @@ -0,0 +1,193 @@ +#!/bin/bash --login + +HOST=$(hostname) + +function WhereAmI() { + python3 -c 'import os; print(os.getcwd())' +} + +# function join_by { +# local d=${1-} f=${2-}; if shift 2; then printf %s "$f" "${@/#/$d}"; fi; +# } + +USER=$(whoami) +HERE=$(WhereAmI) +ALCF_DIR="${HERE}/ALCF" +PARENT=$(dirname "${ALCF_DIR}") + +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "PARENT: ${PARENT}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source=./setup.sh + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} + +MASTER_ADDR=$(uname -n) +MASTER_PORT=20010 +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +MPI_WRAPPER="${SCRIPT_DIR}/mpi_wrapper" + +# sourceFile "${ALCF_DIR}/args.sh" + +# MAIN="${PARENT}/pretrain_${MODEL_TYPE}.py" +MAIN="${PARENT}/pretrain_gpt_alcf.py" + +printJobInfo() { + echo "Job started at: ${TSTAMP} on $(hostname)" + echo "Job running in: ${DIR}" + echo "Training Llama2 with ${MODEL_SIZE} parameters" + echo "Writing logs to: ${OUTPUT_DIR}" + echo 'to view output: tail -f $(tail -1 logfiles)' + echo "i.e. 
tail -f $(tail -1 "${PARENT}"/logfiles)" +} + +launchJob() { + echo "using: $(which python3)" | tee -a "${OUTPUT_LOG}" + printJobInfo | tee -a "${OUTPUT_LOG}" + echo EXEC="${EXEC}" | tee -a "${OUTPUT_LOG}" + echo "Writing logs to: ${OUTPUT_LOG}" | tee -a "${OUTPUT_LOG}" + # ARGS="$@" + # export ARGS="$ARGS" + ${EXEC} "$@" # >> "${OUTPUT_LOG}" 2>&1 & +} + +# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ Use all available GPUs a single nodes ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +fullNode() { + echo "fullNode started" + echo "MPI_COMMAND ${MPI_COMMAND}" + echo "MPI_DEFAULTS ${MPI_DEFAULTS}" + echo "NGPUS ${NGPUS}" + echo "hostfile ${DIR}/hostfile" + echo "MAIN ${MAIN}" + echo "gpt_args ${ARGS}" + NHOSTS=$(wc -l < "${HOSTFILE}") + NGPU_PER_HOST=$(nvidia-smi -L | wc -l) + NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) + # hostname > $DIR/hostfile + echo "\ + Running on $NHOSTS hosts \ + with $NGPU_PER_HOST GPUs each \ + for a total of $NGPUS GPUs" + _EXEC=( + "${MPI_COMMAND}" + "${MPI_DEFAULTS}" + "${MPI_ELASTIC}" + "${MPI_WRAPPER}" + "${MASTER_ADDR}" + "${MASTER_PORT}" + "${MAIN}" + "${ARGS}" + # "${ds_args}" + ) + # EXEC=$(join_by ' ' "${EXEC[*]}") + EXEC="${EXEC[*]}" + OUTPUT_LOG="${OUTPUT_DIR}/logs/$USER-$HOST-nhosts${NHOSTS}-ngpu${NGPUS}-$TSTAMP.log" + mkdir -p "$(dirname "${OUTPUT_LOG}")" + echo "${OUTPUT_LOG}" >> "${PARENT}/logfiles" + printJobInfo | tee -a "${OUTPUT_LOG}" + launchJob "$@" 2>&1 | tee "${OUTPUT_LOG}" +} + + +function setupSrunOld() { + if [[ $(hostname) == login* || $(hostname) == nid* ]]; then + export NODELIST="${SLURM_JOB_NODELIST:-$(hostname)}" + export MACHINE="Perlmutter" + export NHOSTS="${SLURM_NNODES:-1}" + export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" + export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" + export SRUN_EXEC="srun -N ${NHOSTS} -n ${NGPUS} -l -u" + else + echo "Skipping setupSrun() on $(hostname)" + fi +} + +function setupSrun() { + if [[ $(hostname) == login* || $(hostname) == nid* ]]; then + export NHOSTS="${SLURM_NNODES:-1}" + export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" + export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" + export SRUN_EXEC="srun --gpus ${NGPUS} --gpus-per-node ${NGPU_PER_HOST} -N ${NHOSTS} -n ${NGPUS} -l -u --verbose" + else + echo "Skipping setupSrun() on $(hostname)" + fi +} + + +# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ Use all available GPUs on all available nodes ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +elasticDistributed() { + if [[ $(hostname) == theta* || $(hostname) == x3* ]]; then + if [[ $(hostname) == theta* ]]; then + echo "Setting up ThetaGPU from $(hostname)" + HOSTFILE="${HOSTFILE:-${COBALT_NODEFILE}}" + elif [[ $(hostname) == x3* ]]; then + echo "Setting up Polaris from $(hostname)" + HOSTFILE="${HOSFILE:-${PBS_NODEFILE}}" + else + echo "Unknown hostname $(hostname)" + exit 1 + fi + NHOSTS=$(wc -l < "${HOSTFILE}") + NGPU_PER_HOST=$(nvidia-smi -L | wc -l) + NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" + export MASTER_ADDR="127.0.0.1" + export MASTER_PORT="5432" + EXEC_STR=( + "${MPI_COMMAND}" + "${MPI_DEFAULTS}" + "${MPI_ELASTIC}" + "$(which python3)" + "${MAIN}" + "${ARGS}" + # "${gpt_args}" + # "${ds_args}" + ) + elif [[ $(hostname) == nid* || $(hostname) == login* ]]; then + echo "Setting up from Perlmutter on $(hostname)" + # NHOSTS=${SLURM_NNODES-1} + MACHINE="Perlmutter" + setupPerlmutter + setupSrun + echo "SRUN_EXEC: ${SRUN_EXEC}" + export MASTER_ADDR="$SLURMD_NODENAME" + EXEC_STR=( + "${SRUN_EXEC}" + "$(which python3)" + "${MAIN}" + 
"${ARGS}" + # "${gpt_args}" + # "${ds_args}" + ) + else + echo "Unexpected hostname $(hostname)" + fi + export WORLD_SIZE="${NGPUS}" + echo "\ + Running on ${NHOSTS} hosts \ + with ${NGPU_PER_HOST} GPUs each \ + for a total of ${NGPUS} GPUs" + EXEC="${EXEC_STR[*]}" + OUTPUT_LOG="${OUTPUT_DIR}/logs/$USER-$HOST-nhosts${NHOSTS}-ngpu${NGPUS}-$TSTAMP.log" + echo "EXEC_STR: ${EXEC_STR}" + echo "Writing logs to: ${OUTPUT_LOG}" + mkdir -p "$(dirname "${OUTPUT_LOG}")" + echo "${OUTPUT_LOG}" >> "${PARENT}/logfiles" + printJobInfo | tee -a "${OUTPUT_LOG}" + launchJob "$@" >> "${OUTPUT_LOG}" 2>&1 & + PID=$! + wait $PID +} diff --git a/ALCF/pre-AuroraGPT/llama2_vars.sh b/ALCF/pre-AuroraGPT/llama2_vars.sh new file mode 100755 index 0000000000..2fc8c3898a --- /dev/null +++ b/ALCF/pre-AuroraGPT/llama2_vars.sh @@ -0,0 +1,435 @@ +#!/bin/bash +# This example script is contributed by external user https://github.com/nrailgun +# [2023-12-20]: Modified by [@saforem2](https://github.com/saforem2) +# set -ex +# +function WhereAmI() { + python3 -c 'import os; print(os.getcwd())' +} + +function join_by { local d=${1-} f=${2-}; if shift 2; then printf %s "$f" "${@/#/$d}"; fi; } + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} + + +USER=$(whoami) +HERE=$(WhereAmI) +ALCF_DIR="${HERE}/ALCF" +PARENT=$(dirname "${ALCF_DIR}") +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "PARENT: ${PARENT}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + + +HOSTNAME=$(hostname) +sourceFile "${ALCF_DIR}/setup.sh" +sourceFile "${ALCF_DIR}/model.sh" + +WORLD_SIZE="${NGPUS}" +PARALLEL_SIZE="${WORLD_SIZE}" +echo "NHOSTS * (NGPU / HOST) = $NHOSTS * $NGPU_PER_HOST = $NGPUS" + +# MODEL_LLAMA_KEY="LLAMA-24L" +# HIDDEN_SIZE=2048 # e.g. llama-13b: 5120 +# FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824 +# NUM_LAYERS=24 # e.g. llama-13b: 40 +# NUM_HEADS=16 # e.g. llama-13b: 40 +# SEQ_LENGTH=2048 +# NUM_KV_HEADS=4 # llama2 70B uses GQA +# FFN_HIDDEN_SIZE=5504 +# NUM_HEADS=16 # e.g. 
llama-13b: 40 +###################################### +# Change the below configurations here +# wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin +# wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx + +USER=$(whoami) +HERE=$(WhereAmI) +ALCF_DIR="${HERE}/ALCF" +PARENT=$(dirname "${ALCF_DIR}") +# echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +# echo "ALCF_DIR: ${ALCF_DIR}" +# # echo "PARENT: ${PARENT}" +# echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + +MEGATRON_DIR="${HERE}" + +# DATA_DIR="${HOME}/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DeepSpeed/dataset/" +BASE_PATH="${MEGATRON_DIR}" +DS_CONFIG=${BASE_PATH}/deepspeed.json +DATASET_1="${DATA_DIR}/BookCorpusDataset_text_document" +# DATASET_1="./tmp/data/bookcorpus_train_1m_text_sentence" +DATASET="1 ${DATASET_1}" +# CHECKPOINT_PATH=./tmp +TOKENIZER_PATH=./tmp/tokenizer.model # offical llama tokenizer.model + +if [[ $(hostname) == nid* || $(hostname) == login* ]]; then + DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" + DATA_TYPE="BookCorpusDataset_text_document" +elif [[ $(hostname) == theta* || $(hostname) == x3* ]]; then + DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k" + DATA_TYPE="genslm_subsample_200k_sequence_document" +else + echo "Unable to determine DATA_PARENT for $(hostname)." + echo "Exiting!" + exit 1 +fi + +DATA_DIR="${DATA_PARENT}/dataset" +DATA_PATH="${DATA_DIR}/${DATA_TYPE}" +VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" +MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" + +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "MEGATRON_DIR: ${MEGATRON_DIR}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + +DATA_LOAD_ARGS=( + "--data-path $DATA_PATH" + "--vocab-file $VOCAB_FILE" + "--merge-file $MERGE_FILE" +) + +# TP=2 +# PP=2 +# ZERO_STAGE=0 + +GPUS_PER_NODE=$(nvidia-smi -L | wc -l) +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=$(wc -l < "${PBS_NODEFILE:-${COBALT_NODEFILE:-1}}") +NODE_RANK=0 + +# TP=2 +# PP=2 +# ZERO_STAGE=0 +# +export SEQ_LENGTH=${SEQ_LENGTH:-2048} +export NUM_KV_HEADS=4 # llama2 70B uses GQA +export MODEL_SIZE_KEY="${MODEL_SIZE_KEY:-LLAMA_7B}" +export MODEL_TYPE=${MODEL_TYPE:-llama} +echo "==========================+" +echo "Using ${MODEL_SIZE_KEY}" +echo "==========================+" + + +export DDP_IMPL="local" +export GAS=${GAS:-1} +export MPSIZE=${MPSIZE:-1} +export SPSIZE=${SPSIZE:-1} +export PPSIZE=${PPSIZE:-1} +export SP_TYPE=${SP_TYPE:-"ds"} +export MICRO_BATCH=${MICRO_BATCH:-1} + +# export HIDDEN_SIZE=2048 # e.g. llama-13b: 5120 +# export FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824 +# export NUM_LAYERS=24 # e.g. llama-13b: 40 +# export NUM_HEADS=16 # e.g. llama-13b: 40 +# export SEQ_LENGTH=${SEQ_LENGTH:-2048} +# export NUM_KV_HEADS=4 # llama2 70B uses GQA + +NUM_KV_HEADS=4 # llama2 70B uses GQA +FFN_HIDDEN_SIZE=5504 + +# GLOBAL_BATCH=32 # e.g. llama: 4M tokens +TRAIN_STEPS=250000 # e.g. 
llama: 1T tokens / 4M tokens_per_batch = 250000 steps +LR=3e-4 +MIN_LR=3e-5 +LR_WARMUP_STEPS=2000 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +## Activation checkpointing saves GPU memory, but reduces training speed +# activation_checkpoint="true" +activation_checkpoint="false" + +# Below configuration required for llama model as per llama paper +# --no-query-key-layer-scaling \ +# --attention-dropout 0 \ +# --hidden-dropout 0 \ +# --use-rotary-position-embeddings \ +# --untie-embeddings-and-output-weights \ +# --swiglu \ +# --normalization rmsnorm \ +# --disable-bias-linear \ +###################################### + +# Deal with Sequence Parallel implementation --------------------------------------- +# ---------------------------------------------------------------------------------- +if [[ ${SP_TYPE} == "ds" ]]; then + # NOTE: -------------------------------------------------------------------- + # SP_TYPE="ds" has NO effect, essentially running with no Seq. parallelism + # -------------------------------------------------------------------------- + if [[ "$MPSIZE" == "${WORLD_SIZE}" ]]; then + # hacky workaround to try and use SP_TYPE="ds" + MPSIZE="${WORLD_SIZE}" + # ------------------------------------------------------------------------ + # Update [2023-08-22]: Chengming mentioned that this is an internal issue + # and will NOT work currently + # ------------------------------------------------------------------------ + echo "Caught MPSIZE: $MPSIZE from env. Setting SPSIZE=1" + SPSIZE=1 + MPSIZE="${MPSIZE}" + else + echo "Didn't catch MPSIZE from env. Setting SPSIZE=${WORLD_SIZE}, MPSIZE=1" + MPSIZE=1 + SPSIZE="${WORLD_SIZE}" + fi + if [ -z "${ZERO_STAGE}" ]; then + echo "ZERO_STAGE not set, setting to 3 for ${SP_TYPE}" + ZERO_STAGE=3 + else + echo "Caught ZERO_STAGE=${ZERO_STAGE} with ${SP_TYPE}" + fi + export SPSIZE="${SPSIZE:-$WORLD_SIZE}" + export MPSIZE="${MPSIZE:-1}" + export USE_SEQUENCE_PARALLEL=0 + export ZERO_STAGE="${ZERO_STAGE}" +elif [[ ${SP_TYPE} == "megatron" ]]; then + # NOTE: -------------------------------------------------------------------------- + # SP_TYPE="megatron" will use Megatron's Seq. 
|| implementation with ZERO_STAGE=0 + # -------------------------------------------------------------------------------- + [ "$SPSIZE" ] && echo "Caught SPSIZE: ${SPSIZE} from env" || SPSIZE=1 + [ "$MPSIZE" ] && echo "Caught MPSIZE: ${MPSIZE} from env" || MPSIZE="${WORLD_SIZE}" + [ "$ZERO_STAGE" ] && echo "Caught ${ZERO_STAGE} from env" || ZERO_STAGE=0 + [ "$USE_SEQUENCE_PARALLEL" ] && echo "Caught USE_SP: $USE_SEQUENCE_PARALLEL from env" || USE_SEQUENCE_PARALLEL=1 + if [[ ${PPSIZE} > 1 ]]; then # && ${MPSIZE}==${WORLD_SIZE} ]]; + MPSIZE=$(( WORLD_SIZE / PPSIZE )) + echo "Re-setting MPSIZE to ${WORLD_SIZE} / ${PPSIZE} = $(( WORLD_SIZE / PPSIZE ))" + echo "MPSIZE: $MPSIZE" + # MPSIZE="${WORLD_SIZE}/" + fi + export SPSIZE="${SPSIZE}" + export MPSIZE="${MPSIZE}" + export ZERO_STAGE="${ZERO_STAGE}" + export USE_SEQUENCE_PARALLEL="${USE_SEQUENCE_PARALLEL:-1}" +else + echo "Unexpected SP_TYPE: ${SP_TYPE}" + # exit 1 +fi +# ------------------------------------------------------------------------ +# +echo "####################################################" +echo "USING: ${SP_TYPE}" +echo "SPSIZE: ${SPSIZE}" +echo "PPSIZE: ${SPSIZE}" +echo "MPSIZE: ${MPSIZE}" +echo "ZERO_STAGE: ${ZERO_STAGE}" +echo "WORLD_SIZE: ${WORLD_SIZE}" +echo "USE_SEQUENCE_PARALLEL: ${USE_SEQUENCE_PARALLEL}" +echo "####################################################" + +echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++" +echo "${SP_TYPE} sequence parallelism, with: " +echo " {MPSIZE: ${MPSIZE}, SPSIZE: ${SPSIZE}, USE_SEQUENCE_PARALLEL: ${USE_SEQUENCE_PARALLEL}} !!" +echo "########################################################" + +GLOBAL_BATCH=$(( NGPUS * MICRO_BATCH * GAS )) + +GLOBAL_BATCH=$(( GLOBAL_BATCH / MPSIZE / PPSIZE / SPSIZE)) + +echo "GB = (NGPUS * MB * GAS) / (MP * PP * SP * DP) = ${NGPUS} * ${MICRO_BATCH} * ${GAS} = ${GLOBAL_BATCH} / (${MPSIZE} * ${PPSIZE} * ${PPSIZE})" +# echo "GB = (NGPUS * MB * GAS) / (MP * PP * SP) = (${NGPUS} * ${MICRO_BATCH} * ${GAS}) / (${MPSIZE} * ${PPSIZE} * ${SPSIZE}) = ${GLOBAL_BATCH}" + +if [[ "${GLOBAL_BATCH}" == 0 ]]; then + GLOBAL_BATCH=1 +fi +# [ "${GLOBAL_BATCH:-${GLOBAL_BATCH}}" == 0 ] && GLOBAL_BATCH=1 || echo "GLOBAL_BATCH: ${GLOBAL_BATCH}" + +DPSIZE=$(( $WORLD_SIZE / $PPSIZE / $MPSIZE )) + +export GLOBAL_BATCH="$(( GLOBAL_BATCH * DPSIZE ))" +# export GLOBAL_BATCH="$GLOBAL_BATCH" +# echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" +# echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + +echo "--------------------------------" +echo "GLOBAL_BATCH=${GLOBAL_BATCH}" +echo "USING DPSIZE: ${DPSIZE}" +echo "--------------------------------" + +# REMAINDER=$(( GLOBAL_BATCH % (MICRO_BATCH * DPSIZE))) +# if [[ "${GLOBAL_BATCH} "]] + + + + +cat < $DS_CONFIG +{ + "train_batch_size" : $GLOBAL_BATCH, + "train_micro_batch_size_per_gpu": $MICRO_BATCH, + "steps_per_print": 1, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "wandb": { + "enabled": true, + "project": "GenSLM-Megatron-DS" + } +} +EOT + +ds_args="" +ds_args=" --deepspeed ${ds_args}" +ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" +ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" + +if [ "${activation_checkpoint}" = "true" ]; then + ds_args="--deepspeed-activation-checkpointing ${ds_args}" + + ## old argument for recomputing the transformer layer + # ds_args="--checkpoint-activations ${ds_args}" + + ## new argument for recomputing the transformer layer + ds_args="--recompute-granularity full --recompute-method uniform ${ds_args}" + ## new argument for recomputing only 
the attention layer + # ds_args="--recompute-granularity selective ${ds_args}" +fi + + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +# torchrun $DISTRIBUTED_ARGS \ +# pretrain_gpt.py \ + + +# ┏━━━━━━━━━━━━━━━━━━━┓ +# ┃ FILE I/O SETTINGS ┃ +# ┗━━━━━━━━━━━━━━━━━━━┛ +RUN_STR="gb${GLOBAL_BATCH}_mb${MICRO_BATCH}" +RUN_STR="nl${NLAYERS}_hs${HIDDEN}_${RUN_STR}" +RUN_STR="mp${MPSIZE}_pp${PPSIZE}_sp${SPSIZE}_${RUN_STR}" +RUN_STR="z${ZERO_STAGE}_seqlen${SEQ_LEN}_${RUN_STR}" +RUN_STR="${MODEL_SIZE_KEY}_${RUN_STR}" + +# if [[ "${USE_FLASH_ATTN}" == 0 ]]; then +# echo "Not using Flash Attention!!" +# else +# +if [[ "${USE_FLASH_ATTN1}" || "${USE_FLASH_ATTN_V1}" ]]; then + # Flash Attention 1 + [ "${USE_FLASH_ATTN}" ] && RUN_STR="flashAttn_v1_${RUN_STR}" + [ "${USE_FLASH_ATTN1}" ] && RUN_STR="flashAttn_v1_${RUN_STR}" + [ "${USE_FLASH_ATTN_V1}" ] && RUN_STR="flashAttn_v1_${RUN_STR}" +elif [[ "${USE_FLASH_ATTN2}" || "${USE_FLASH_ATTN_V2}" ]]; then + # Flash Attention 2 + [ "${USE_FLASH_ATTN2}" ] && RUN_STR="flashAttn_v2_${RUN_STR}" + [ "${USE_FLASH_ATTN_V2}" ] && RUN_STR="flashAttn_v2_${RUN_STR}" +elif [[ "${USE_FLASH_ATTN_TRITON}" ]]; then + # Triton + Flash Attn + # Triton + Flash Attn + [ "${USE_FLASH_ATTN_TRITON}" ] && RUN_STR="flashAttn_triton_${RUN_STR}" +else + echo "Not using Flash Attention!" +fi + +if [[ $DDP_IMPL == 'FSDP' ]]; then + RUN_STR="FSDP_${RUN_STR}" +fi +if [[ $USE_ACTIVATION_CHECKPOINTING == 1 ]]; then + RUN_STR="actCkpt_${RUN_STR}" +fi +if [[ $USE_SEQUENCE_PARALLEL == 1 ]] ; then + RUN_STR="SP_${RUN_STR}" +fi + +RUN_STR="${MODEL_SIZE}_${RUN_STR}" + +OUTPUT_DIR="${PARENT}/outputs/${RUN_STR}" +CHECKPOINT_DIR="${PARENT}/checkpoints/$RUN_STR" +TENSORBOARD_DIR="${PARENT}/outputs/${RUN_STR}/tensorboard" + +DATE=$(date) +export DATE="${DATE}" +export RUN_STR="${RUN_STR}" +export MODEL_SIZE="${MODEL_SIZE:-${MODEL_SIZE_KEY}}" +export MODEL_SIZE="$MODEL_SIZE" +export TENSORBOARD_DIR=$TENSORBOARD_DIR +export OUTPUT_DIR=$OUTPUT_DIR +mkdir -p "$OUTPUT_DIR/tensorboard/wandb" +mkdir -p "$CHECKPOINT_DIR" +mkdir -p "$TENSORBOARD_DIR" +mkdir -p "$OUTPUT_DIR" +echo "OUTPUT TO: ${OUTPUT_DIR}" + +gpt_args=( + "--tensor-model-parallel-size $MPSIZE" + "--pipeline-model-parallel-size $PPSIZE" + "--num-layers $NLAYERS" + "--hidden-size $HIDDEN" + "--ffn-hidden-size $FFN_HIDDEN_SIZE" + "--num-attention-heads $ATEN_HEADS" + "--micro-batch-size $MICRO_BATCH" + "--global-batch-size $GLOBAL_BATCH" + "--seq-length $SEQ_LENGTH" + "--max-position-embeddings $SEQ_LENGTH" + "--train-iters $TRAIN_STEPS" + "--save $CHECKPOINT_DIR" + "--load $CHECKPOINT_DIR" + "--data-path $DATASET" + "--data-impl mmap" + "--tokenizer-type GPTSentencePieceTokenizer" + "--tokenizer-model $TOKENIZER_PATH" + "--split 949,50,1" + "--distributed-backend nccl" + "--lr $LR" + "--lr-decay-style cosine" + "--min-lr $MIN_LR" + "--weight-decay $WEIGHT_DECAY" + "--clip-grad $GRAD_CLIP" + "--lr-warmup-iters $LR_WARMUP_STEPS" + "--optimizer adam" + "--adam-beta1 0.9" + "--adam-beta2 0.95" + "--log-interval 1" + "--save-interval 10000" + "--eval-interval 1000" + "--eval-iters 10" + "--bf16" + "--no-query-key-layer-scaling" + "--attention-dropout 0" + "--hidden-dropout 0" + "--use-rotary-position-embeddings" + "--untie-embeddings-and-output-weights" + "--swiglu" + "--normalization rmsnorm" + "--disable-bias-linear" + "--num-key-value-heads $NUM_KV_HEADS" + "--tensorboard-dir ${TENSORBOARD_DIR}" + "--log-timers-to-tensorboard" + 
"--tensorboard-log-interval 1" + "--data-path $DATA_PATH" + "--vocab-file $VOCAB_FILE" + "--merge-file $MERGE_FILE" +) + +# DATA_LOAD_ARGS=( +# "--data-path $DATA_PATH" +# "--vocab-file $VOCAB_FILE" +# "--merge-file $MERGE_FILE" +# ) + +export gpt_args=( + "${gpt_args[*]}" + "${ds_args[*]}" + # "${DATA_LOAD_ARGS[*]}" +) +ARGS="$(join_by ' ' ${gpt_args[*]})" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "ARGS: ${ARGS}" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +# gpt_args+="${ds_args}" +# gpt_args+="${DATA_LOAD_ARGS}" diff --git a/ALCF/pre-AuroraGPT/model.sh b/ALCF/pre-AuroraGPT/model.sh new file mode 100755 index 0000000000..ef01f5541d --- /dev/null +++ b/ALCF/pre-AuroraGPT/model.sh @@ -0,0 +1,383 @@ +#!/bin/bash --login +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ GPT MODEL SETTINGS ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ Model / Architecture settings ┃ +# ┃ ---------------------------------------------------- ┃ +# ┃ GPT-3 models use 2K sequence length/context window ┃ +# ┃ The "GPT-3 XXX" below are configs from GPT-3 paper ┃ +# ┃ https://arxiv.org/abs/2005.14165, choose based on ┃ +# ┃ your desired model size or build your own configs ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ + +declare -A A_NLAYERS +declare -A A_HIDDEN +declare -A A_ATEN_HEADS + +# | =================== Llama 2 Architecture ==================== | +# | Hidden Size | Inter. Size | Atten Heads | Layers | Model Size | +# |:-----------:|:-----------:|:-----------:|:------:|:----------:| +# | 4096 | 11008 | 32 | 32 | 7b | +# | 5120 | 13824 | 40 | 40 | 13b | +# | 8192 | 28672 | 64 | 80 | 70b | + +MODEL_LLAMA_7B_KEY="LLAMA_7B" +A_NLAYERS[$MODEL_LLAMA_7B_KEY]=32 +A_ATEN_HEADS[$MODEL_LLAMA_7B_KEY]=32 +A_HIDDEN[$MODEL_LLAMA_7B_KEY]=4096 + +MODEL_LLAMA_13B_KEY="LLAMA_13B" +A_NLAYERS[$MODEL_LLAMA_13B_KEY]=40 +A_ATEN_HEADS[$MODEL_LLAMA_13B_KEY]=40 +A_HIDDEN[$MODEL_LLAMA_13B_KEY]=5120 + +MODEL_LLAMA_70B_KEY="LLAMA_70B" +A_NLAYERS[$MODEL_LLAMA_70B_KEY]=80 +A_ATEN_HEADS[$MODEL_LLAMA_70B_KEY]=64 +A_HIDDEN[$MODEL_LLAMA_70B_KEY]=8192 + +# HIDDEN_SIZE=4096 +# NUM_LAYERS=24 # e.g. llama-13b: 40 + + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ Llama2 ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE_KEY="LLAMA_24L" +# HIDDEN_SIZE=2048 # e.g. llama-13b: 5120 +# FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824 +# NUM_LAYERS=24 # e.g. llama-13b: 40 +# NUM_HEADS=16 # e.g. 
llama-13b: 40 +# SEQ_LENGTH=2048 +# NUM_KV_HEADS=4 # llama2 70B uses GQA +# + + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3 Small: 125M ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +MODEL_125M_KEY="GPT125M" +A_NLAYERS[$MODEL_125M_KEY]=12 +A_HIDDEN[$MODEL_125M_KEY]=768 +A_ATEN_HEADS[$MODEL_125M_KEY]=16 + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ BERT: 1.2B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="BERT1.2B" +# NLAYERS=24 +# HIDDEN=2048 +# ATEN_HEADS=128 + +BERT_1_2B_KEY="BERT1.2B" +A_NLAYERS[$BERT_1_2B_KEY]=24 +A_HIDDEN[$BERT_1_2B_KEY]=2048 +A_ATEN_HEADS[$BERT_1_2B_KEY]=128 + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 1.5B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="1.5B" +# NLAYERS=48 +# HIDDEN=1536 +# ATEN_HEADS=24 + +MODEL_1_5B_KEY="GPT1_5B" +A_NLAYERS[$MODEL_1_5B_KEY]=48 +A_HIDDEN[$MODEL_1_5B_KEY]=1536 +A_ATEN_HEADS[$MODEL_1_5B_KEY]=24 + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 1.5B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="1.5B" +# NLAYERS=48 +# HIDDEN=1600 +# ATEN_HEADS=25 + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 2.7B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="2.7B" +# NLAYERS=32 +# HIDDEN=2560 +# ATEN_HEADS=32 + +MODEL_2_7B_KEY="GPT2_7B" +A_NLAYERS[$MODEL_2_7B_KEY]=32 +A_HIDDEN[$MODEL_2_7B_KEY]=2560 +A_ATEN_HEADS[$MODEL_2_7B_KEY]=32 + +# ┏━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ ✓ GPT-3: 6.7B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="6.7B" +# NLAYERS=32 +# HIDDEN=4096 +# ATEN_HEADS=32 + +MODEL_6_7B_KEY="GPT6_7B" +A_NLAYERS[$MODEL_6_7B_KEY]=32 +A_HIDDEN[$MODEL_6_7B_KEY]=4096 +A_ATEN_HEADS[$MODEL_6_7B_KEY]=32 + +# ┏━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ ✓ GPT-3: 13B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="13B" +# NLAYERS=40 +# HIDDEN=5120 +# ATEN_HEADS=40 + +MODEL_13B_KEY="GPT13B" +A_NLAYERS[$MODEL_13B_KEY]=40 +A_HIDDEN[$MODEL_13B_KEY]=5120 +A_ATEN_HEADS[$MODEL_13B_KEY]=64 + +# ┏━━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ ✓ GPT-3: 18.4B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="18.4B" +# NLAYERS=40 +# HIDDEN=6144 +# ATEN_HEADS=48 + +# ┏━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ ✓ GPT-3: 20B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="20B" +# NLAYERS=44 +# HIDDEN=6144 +# ATEN_HEADS=64 + +# ┏━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 25B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="25B" +# NLAYERS=64 +# ------------ +# HIDDEN=5760 # DEFAULT (no flash attn) +# ATEN_HEADS=64 +# ------------ +# HIDDEN=5888 # headdim = 5888 / 46 = 128 +# ATEN_HEADS=46 +# ----------------- +# -- FLASH ATTN -- +# headdim = 5760 / 80 = 72 +# HIDDEN=5760 +# ATEN_HEADS=80 +# ------------ + +MODEL_25B_KEY="GPT25B" +A_NLAYERS[$MODEL_25B_KEY]=64 +A_HIDDEN[$MODEL_25B_KEY]=6144 +A_ATEN_HEADS[$MODEL_25B_KEY]=64 + +# ┏━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 30B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="30B" +# NLAYERS=64 +# HIDDEN=6144 +# ATEN_HEADS=64 + +# head size must be divisible by 8 (requirements of flash attention) +# head num must be divisible by sequence/tensor parallel size +MODEL_30B_KEY="GPT30B" +A_NLAYERS[$MODEL_30B_KEY]=64 +A_HIDDEN[$MODEL_30B_KEY]=6144 +A_ATEN_HEADS[$MODEL_30B_KEY]=64 + +# ┏━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 33B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="33B" +# NLAYERS=80 +# HIDDEN=5760 +# ATEN_HEADS=80 + +# MODEL_33B_KEY="GPT33B" +# A_NLAYERS[$MODEL_33B_KEY]=80 +# A_HIDDEN[$MODEL_33B_KEY]=5760 +# A_ATEN_HEADS[$MODEL_33B_KEY]=80 + +MODEL_33B_KEY="GPT33B" +A_NLAYERS[$MODEL_33B_KEY]=80 +A_HIDDEN[$MODEL_33B_KEY]=6144 +A_ATEN_HEADS[$MODEL_33B_KEY]=64 + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 145B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="145B" +# NLAYERS=80 +# HIDDEN=12288 +# ATEN_HEADS=96 +# +GPT145B_HIDDEN=12288 
+GPT145B_ATEN_HEADS=96 + +MODEL_145B_2L_KEY="GPT145B_2L" +A_NLAYERS[$MODEL_145B_2L_KEY]=2 +A_HIDDEN[$MODEL_145B_2L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_2L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_4L_KEY="GPT145B_4L" +A_NLAYERS[$MODEL_145B_4L_KEY]=4 +A_HIDDEN[$MODEL_145B_4L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_4L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_5L_KEY="GPT145B_5L" +A_NLAYERS[$MODEL_145B_5L_KEY]=5 +A_HIDDEN[$MODEL_145B_5L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_5L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_6L_KEY="GPT145B_6L" +A_NLAYERS[$MODEL_145B_6L_KEY]=6 +A_HIDDEN[$MODEL_145B_6L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_6L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_8L_KEY="GPT145B_8L" +A_NLAYERS[$MODEL_145B_8L_KEY]=8 +A_HIDDEN[$MODEL_145B_8L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_8L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_10L_KEY="GPT145B_10L" +A_NLAYERS[$MODEL_145B_10L_KEY]=10 +A_HIDDEN[$MODEL_145B_10L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_10L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_12L_KEY="GPT145B_12L" +A_NLAYERS[$MODEL_145B_12L_KEY]=12 +A_HIDDEN[$MODEL_145B_12L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_12L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_16L_KEY="GPT145B_16L" +A_NLAYERS[$MODEL_145B_16L_KEY]=16 +A_HIDDEN[$MODEL_145B_16L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_16L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_24L_KEY="GPT145B_24L" +A_NLAYERS[$MODEL_145B_24L_KEY]=24 +A_HIDDEN[$MODEL_145B_24L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_24L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_32L_KEY="GPT145B_32L" +A_NLAYERS[$MODEL_145B_32L_KEY]=32 +A_HIDDEN[$MODEL_145B_32L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_32L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_48L_KEY="GPT145B_48L" +A_NLAYERS[$MODEL_145B_48L_KEY]=48 +A_HIDDEN[$MODEL_145B_48L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_48L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_64L_KEY="GPT145B_64L" +A_NLAYERS[$MODEL_145B_64L_KEY]=64 +A_HIDDEN[$MODEL_145B_64L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_64L_KEY]="${GPT145B_ATEN_HEADS}" + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 175B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="175B" +# NLAYERS=96 +# HIDDEN=12288 +# ATEN_HEADS=96 +# if [ -z "$NLAYERS" ]; then +# A_NLAYERS[$MODEL_145B_KEY]="${NLAYERS}" +# echo "Caught NLAYERS=${NLAYERS} from env, using this value!" 
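+# ------------------------------------------------------------------
+# NOTE (illustrative aside, not part of the original config tables):
+# every MODEL_*_KEY in this file is just an index into the A_NLAYERS /
+# A_HIDDEN / A_ATEN_HEADS associative arrays; the exports at the bottom
+# of this file resolve whichever key is in MODEL_SIZE_KEY, e.g.:
+#
+#   MODEL_SIZE_KEY="LLAMA_7B"
+#   echo "${A_NLAYERS[$MODEL_SIZE_KEY]}"     # -> 32
+#   echo "${A_HIDDEN[$MODEL_SIZE_KEY]}"      # -> 4096
+#   echo "${A_ATEN_HEADS[$MODEL_SIZE_KEY]}"  # -> 32
+# ------------------------------------------------------------------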
+# else +# A_NLAYERS[$MODEL_145B_KEY]=80 +# echo "Using default NLAYERS=80" +# fi +MODEL_145B_KEY="GPT145B" +A_NLAYERS[$MODEL_145B_KEY]=80 +A_HIDDEN[$MODEL_145B_KEY]=12288 +A_ATEN_HEADS[$MODEL_145B_KEY]=96 + + + +MODEL_1T_HIDDEN=25600 +MODEL_1T_ATEN_HEADS=160 +MODEL_1T_1L_KEY="GPT1T_1L" +A_NLAYERS[$MODEL_1T_1L_KEY]=1 +A_HIDDEN[$MODEL_1T_1L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_1L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_2L_KEY="GPT1T_2L" +A_NLAYERS[$MODEL_1T_2L_KEY]=2 +A_HIDDEN[$MODEL_1T_2L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_2L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_4L_KEY="GPT1T_4L" +A_NLAYERS[$MODEL_1T_4L_KEY]=4 +A_HIDDEN[$MODEL_1T_4L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_4L_KEY]="${MODEL_1T_ATEN_HEADS}" + + +MODEL_1T_8L_KEY="GPT1T_8L" +A_NLAYERS[$MODEL_1T_8L_KEY]=8 +A_HIDDEN[$MODEL_1T_8L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_8L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_16L_KEY="GPT1T_16L" +A_NLAYERS[$MODEL_1T_16L_KEY]=16 +A_HIDDEN[$MODEL_1T_16L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_16L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_24L_KEY="GPT1T_24L" +A_NLAYERS[$MODEL_1T_24L_KEY]=24 +A_HIDDEN[$MODEL_1T_24L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_24L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_30L_KEY="GPT1T_30L" +A_NLAYERS[$MODEL_1T_30L_KEY]=30 +A_HIDDEN[$MODEL_1T_30L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_30L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_32L_KEY="GPT1T_32L" +A_NLAYERS[$MODEL_1T_32L_KEY]=32 +A_HIDDEN[$MODEL_1T_32L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_32L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_60L_KEY="GPT1T_60L" +A_NLAYERS[$MODEL_1T_60L_KEY]=60 +A_HIDDEN[$MODEL_1T_60L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_60L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_64L_KEY="GPT1T_64L" +A_NLAYERS[$MODEL_1T_64L_KEY]=64 +A_HIDDEN[$MODEL_1T_64L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_64L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_120L_KEY="GPT1T_120L" +A_NLAYERS[$MODEL_1T_120L_KEY]=120 +A_HIDDEN[$MODEL_1T_120L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_120L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_128L_KEY="GPT1T_128L" +A_NLAYERS[$MODEL_1T_128L_KEY]=128 +A_HIDDEN[$MODEL_1T_128L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_128L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_256L_KEY="GPT1T_256L" +A_NLAYERS[$MODEL_1T_256L_KEY]=256 +A_HIDDEN[$MODEL_1T_256L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_256L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_512L_KEY="GPT1T_512L" +A_NLAYERS[$MODEL_1T_512L_KEY]=512 +A_HIDDEN[$MODEL_1T_512L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_512L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_1024L_KEY="GPT1T_1024L" +A_NLAYERS[$MODEL_1T_1024L_KEY]=1024 +A_HIDDEN[$MODEL_1T_1024L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_1024L_KEY]="${MODEL_1T_ATEN_HEADS}" + +export MODEL_SIZE="${MODEL_SIZE_KEY}" +export NLAYERS="${A_NLAYERS[$MODEL_SIZE_KEY]}" +export HIDDEN="${A_HIDDEN[$MODEL_SIZE_KEY]}" +export ATEN_HEADS="${A_ATEN_HEADS[$MODEL_SIZE_KEY]}" diff --git a/ALCF/pre-AuroraGPT/setup.sh b/ALCF/pre-AuroraGPT/setup.sh new file mode 100755 index 0000000000..5259ea03a0 --- /dev/null +++ b/ALCF/pre-AuroraGPT/setup.sh @@ -0,0 +1,291 @@ +#!/bin/bash --login +# +function WhereAmI() { + python3 -c 'import os; print(os.getcwd())' +} + +HERE=$(WhereAmI) +# ALCF_DIR=$(find "${HERE}" -name "ALCF") +ALCF_DIR="${HERE}/ALCF" +PARENT=$(dirname "${ALCF_DIR}") + +function join_by { local d=${1-} f=${2-}; if shift 2; then printf %s "$f" "${@/#/$d}"; fi; } 
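+
+# NOTE (illustrative sketch): join_by collapses an array into a single
+# delimiter-separated string; the launch scripts use it to turn argument
+# arrays into one ARGS command-line string. A minimal, hypothetical demo
+# (defined here only as an example, never called):
+function demoJoinBy() {
+    local example_args=("--lr 3e-4" "--bf16" "--swiglu")
+    join_by ' ' "${example_args[@]}"   # prints: --lr 3e-4 --bf16 --swiglu
+}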
+ +function setupVenv() { + VENV_DIR="$1" + # VENV_DIR="${PARENT}/venvs/perlmutter/torch2.0.1/" + if [[ -d "${VENV_DIR}" ]]; then + echo "Found venv at: ${VENV_DIR}" + source "${VENV_DIR}/bin/activate" + else + echo "Skipping setupVenv() on $(hostname)" + fi +} + +function loadCondaEnv() { + if [[ "${CONDA_EXE}" ]]; then + echo "Already inside ${CONDA_EXE}, exiting!" + else + MODULE_STR="$1" + module load "conda/${MODULE_STR}" + conda activate base + fi +} + +function thetagpuMPI() { + if [[ $(hostname) == theta* ]]; then + export HOSTFILE="${HOSTFILE:-${COBALT_NODEFILE}}" + NHOSTS=$(wc -l < "${HOSTFILE}") + NGPU_PER_HOST=$(nvidia-smi -L | wc -l) + NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) + NVME_PATH="/raid/scratch/" + MPI_COMMAND=$(which mpirun) + # export PATH="${CONDA_PREFIX}/bin:${PATH}" + MPI_DEFAULTS="\ + --hostfile ${HOSTFILE} \ + -x CFLAGS \ + -x LDFLAGS \ + -x http_proxy \ + -x CUDA_DEVICE_MAX_CONNECTIONS \ + -x PYTHONUSERBASE \ + -x https_proxy \ + -x PATH \ + -x LD_LIBRARY_PATH" + MPI_ELASTIC="\ + -n ${NGPUS} \ + -npernode ${NGPU_PER_HOST}" + # _MPI_DEFAULTS=( + # "--hostfile ${HOSTFILE}" + # "-x CFLAGS" + # "-x LDFLAGS" + # "-x http_proxy" + # "-x PYTHONUSERBASE" + # "-x https_proxy" + # "-x PATH" + # "-x CUDA_DEVICE_MAX_CONNECTIONS" + # "-x LD_LIBRARY_PATH" + # ) + # _MPI_ELASTIC=( + # "-n ${NGPUS}" + # "-npernode ${NGPU_PER_HOST}" + # ) + # export MPI_DEFAULTS="$(join_by ' ' ${_MPI_DEFAULTS})" + # export MPI_ELASTIC="$(join_by ' ' ${_MPI_ELASTIC})" + else + echo "Skipping thetaGPUMPI() on $(hostname)" + fi +} + +function polarisMPI() { + if [[ $(hostname) == x3* ]]; then + export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" + export NHOSTS=$(wc -l < "${HOSTFILE}") + export NGPU_PER_HOST=$(nvidia-smi -L | wc -l) + export NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) + export MPI_COMMAND=$(which mpiexec) + export NVME_PATH="/local/scratch/" + MPI_DEFAULTS="\ + --envall \ + --verbose \ + --hostfile ${HOSTFILE}" + MPI_ELASTIC="\ + -n ${NGPUS} \ + --ppn ${NGPU_PER_HOST}" + # _MPI_DEFAULTS=( + # "--envall" + # "--verbose" + # "--hostfile ${HOSTFILE}" + # ) + # _MPI_ELASTIC=( + # "-n ${NGPUS}" + # "--ppn ${NGPU_PER_HOST}" + # ) + # export MPI_DEFAULTS="$(join_by ' ' ${_MPI_DEFAULTS})" + # export MPI_ELASTIC="$(join_by ' ' ${_MPI_ELASTIC})" + else + echo "Skipping polarisMPI() on $(hostname)" + fi +} + +function setupMPI() { + if [[ $(hostname) == theta* ]]; then + echo "Setting up MPI on ThetaGPU from $(hostname)" + thetagpuMPI + elif [[ $(hostname) == x* ]]; then + echo "Setting up MPI on Polaris from $(hostname)" + polarisMPI + else + echo "Skipping setupMPI() on hostname $(hostname)" + fi + echo "++ SetupMPI() +++++++++++++++++++++++++++++++++" + echo "Using HOSTFILE: $HOSTFILE" + echo "NHOSTS: ${NHOSTS}" + echo "NGPU_PER_HOST: ${NGPU_PER_HOST}" + echo "NGPUS: $NGPUS" + echo "+++++++++++++++++++++++++++++++++++++++++++++++" +} + +function condaPolaris() { + if [[ $(hostname) == x3* ]]; then + DATE_STR="2023-10-04" + [ "${CONDA_EXE}" ] || loadCondaEnv "${DATE_STR}" + [ "${VIRTUAL_ENV}" ] || setupVenv "${DATE_STR}" + else + echo "Skipping condaPolaris() on $(hostname)" + fi +} + +function condaThetaGPU() { + if [[ $(hostname) == theta* ]]; then + DATE_STR="2023-01-11" + [ "${CONDA_EXE}" ] && echo "Caught CONDA_EXE: ${CONDA_EXE}" || loadCondaEnv "${DATE_STR}" + [ "${VIRTUAL_ENV}" ] && echo "Caught VIRTUAL_ENV: ${VIRTUAL_ENV}" || setupVenv "${DATE_STR}" + # [ "${CONDA_EXE}" ] || loadCondaEnv "${DATE_STR}" + # [ "${VIRTUAL_ENV}" ] || setupVenv "${DATE_STR}" + else + echo "Skipping condaThetaGPU() on 
$(hostname)" + fi +} + +function setupThetaGPU() { + export LAB="ALCF" + export MACHINE="ThetaGPU" + if [[ $(hostname) == theta* ]]; then + setupMPI + DATE_STR="2023-01-11" + [ "${CONDA_EXE}" ] && echo "Caught CONDA_EXE: ${CONDA_EXE}" || loadCondaEnv "${DATE_STR}" + [ "${VIRTUAL_ENV}" ] && echo "Caught VIRTUAL_ENV: ${VIRTUAL_ENV}" || setupVenv "${DATE_STR}" + else + echo "Skipping setupThetaGPU() on $(hostname)" + fi +} + +function setupPolaris() { + export LAB="ALCF" + export MACHINE="Polaris" + if [[ $(hostname) == x3* ]]; then + # SETUP MPI -------------------------------- + setupMPI + # SETUP Python -------------------------------- + DATE_STR="2023-09-29" + [ "${CONDA_EXE}" ] && echo "Caught CONDA_EXE: ${CONDA_EXE}" || loadCondaEnv "${DATE_STR}-unstable" + [ "${VIRTUAL_ENV}" ] && echo "Caught VIRTUAL_ENV: ${VIRTUAL_ENV}" || setupVenv "${DATE_STR}" + else + echo "Skipping setupPolaris() on $(hostname)" + fi +} + + +function setupALCF() { + if [[ $(hostname) == theta* || $(hostname) == x3* ]]; then + setupMPI + if [[ $(hostname) == theta* ]]; then + echo "Setting up ThetaGPU from $(hostname)" + setupThetaGPU + elif [[ $(hostname) == x3* ]]; then + echo "Setting up Polaris from $(hostname)" + setupPolaris + else + echo "Unknown hostname $(hostname) in setupALCF()" + fi + else + echo "Skipping setupALCF() on $(hostname)" + fi +} + + + +function setupSrun() { + if [[ $(hostname) == login* || $(hostname) == nid* ]]; then + export NHOSTS="${SLURM_NNODES:-1}" + export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" + export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" + export SRUN_EXEC="srun -N ${NHOSTS} -n ${NGPUS} -l -u" + else + echo "Skipping setupSrun() on $(hostname)" + fi +} + +# ┏━━━━━━━┓ +# ┃ NERSC ┃ +# ┗━━━━━━━┛ +function setupPerlmutter() { + if [[ $(hostname) == login* || $(hostname) == nid* ]]; then + module load libfabric cudatoolkit pytorch/2.0.1 + [ $SLURM_JOB_ID ] \ + && echo "Caught SLURM_JOB_ID: ${SLURM_JOB_ID}" \ + || echo "!!!!!! Running without SLURM allocation !!!!!!!!" 
+ # if [[ $(hostname) == login* ]]; then + # export MACHINE="NERSC" + # module load pytorch/2.0.1 + # export NHOSTS=1 + # export NGPU_PER_HOST=1 + # export NGPUS=1 + # # echo "$(hostname)" > "${HERE}/hostfile" + # elif [[ $(hostname) == nid* ]]; then + # export NODE_RANK=0 + export NODELIST="${SLURM_JOB_NODELIST:-$(hostname)}" + # export CUDA_DEVICE_MAX_CONNECTIONS=1 + export MACHINE="Perlmutter" + export NHOSTS="${SLURM_NNODES:-1}" + export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" + export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" + # else + # echo "Unexpected $(hostname) on NERSC" + # fi + echo "+++++++++++++++++++++++++++++++++++" + echo "Using python: $(which python3)" + echo "+++++++++++++++++++++++++++++++++++" + else + echo "Skipping setupPerlmutter() on $(hostname)" + fi +} + + +function setupMachine() { + HOSTNAME="$(hostname)" + if [[ $(hostname) == theta* || $(hostname) == x3* ]]; then + export LAB="ALCF" + setupALCF + # [ "${HOSTNAME}==theta*" ] && condaThetaGPU + # [ "${HOSTNAME}==x3*" ] && condaPolaris + elif [[ $(hostname) == nid* || $(hostname) == login* ]]; then + export LAB="NERSC" + setupSrun + setupPerlmutter + # [ "${HOSTNAME}==login*" ] && setupPerlmutter + # [ "${HOSTNAME}==nid*" ] && setupPerlmutter + else + echo "Unexpected hostname: $(hostname)" + fi +} + +# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ SETUP CONDA + MPI ENVIRONMENT @ ALCF ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +function setup() { + export NCCL_DEBUG=warn + # TORCH_EXTENSIONS_DIR="${HERE}/.cache/torch_extensions" + export WANDB_CACHE_DIR="./cache/wandb" + setupMachine + PYTHON_EXECUTABLE="$(which python3)" + export PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" + echo "USING PYTHON: $(which python3)" + # echo "CFLAGS: ${CFLAGS}" + # echo "LDFLAGS: ${LDFLAGS}" + # export NODE_RANK=0 + export NNODES=$NHOSTS + export GPUS_PER_NODE=$NGPU_PER_HOST + export WORLD_SIZE=$NGPUS + export NGPUS="${NGPUS}" + export NHOSTS="${NHOSTS}" + export NGPU_PER_HOST="${NGPU_PER_HOST}" + export CUDA_DEVICE_MAX_CONNECTIONS=1 + echo "########################################" + echo "NHOSTS: ${NHOSTS}" + echo "NGPU_PER_HOST: ${NGPU_PER_HOST}" + echo "NGPUS: (${NHOSTS} * ${NGPU_PER_HOST}) = ${NGPUS}" + echo "########################################" +} + +setup diff --git a/ALCF/pre-AuroraGPT/submit-pbs.sh b/ALCF/pre-AuroraGPT/submit-pbs.sh new file mode 100755 index 0000000000..712a932656 --- /dev/null +++ b/ALCF/pre-AuroraGPT/submit-pbs.sh @@ -0,0 +1,263 @@ +#!/bin/bash --login +# + +cd "${PBS_O_WORKDIR}" || exit + +# cd "${PBS_O_WORKDIR}" +# +# echo "PBS_O_WORKDIR: ${PBS_O_WORKDIR}" +# +# echo "__________________________________________________________________________________" +# cd ~/datascience/foremans/locations/polaris/projects/saforem2/Megatron-DS-Benchmarking/ +# echo "pwd: $(pwd)" +# echo "__________________________________________________________________________________" + +# SOURCE=${BASH_SOURCE[0]} +# while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# SOURCE=$(readlink "$SOURCE") +# [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +# done +# DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" + +# HERE=$(python3 -c 'import os; print(os.getcwd())') +# ALCF_DIR="${HERE}/ALCF" + +ALCF_DIR="$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail 
-1)))/ALCF" +PARENT=$(dirname "${ALCF_DIR}") +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "PARENT: ${PARENT}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + + +TSTAMP=$(tstamp) +echo "┌──────────────────────────────────────────────────────────────────┐" +#####"│ Job Started at 2023-08-04-121535 on polaris-login-04 by foremans │" +echo "│ Job Started at ${TSTAMP} on $(hostname) by $USER │" +echo "│ in: ${PARENT}" +echo "└──────────────────────────────────────────────────────────────────┘" +# echo "------------------------------------------------------------------------" + +getValFromFile() { + FILE=$1 + KEY=$2 + echo "getting ${KEY} from ${FILE}" + if [[ -f "${FILE}" ]]; then + VAL="$(cat "${FILE}" | grep -E "^${KEY}=" | sed "s/${KEY}=//g" | sed 's/\"//g')" + echo "setting ${KEY}: ${VAL}" + export "${KEY}"="${VAL}" + fi +} + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} +# +# sourceFile "${DIR}/setup.sh" +# sourceFile "${DIR}/model.sh" +# sourceFile "${DIR}/args.sh" +# sourceFile "${DIR}/launch.sh" +# +# export USE_ACTIVATION_CHECKPOINTING=1 # 1 | 0 +# export SEQ_LEN=${SEQ_LEN:-1024} +# export MPSIZE=${MPSIZE:-1} +# export PPSIZE=${PPSIZE:-1} +# export SPSIZE=${SPSIZE:-1} +# export MICRO_BATCH=${MICRO_BATCH:-1} +# export ZERO_STAGE=${ZERO_STAGE:-1} # 0 | 1 | 2 | 3 +# export NHOSTS="$NHOSTS" +# export GRADIENT_ACCUMULATION_STEPS=${GAS:-1} +# export USE_SEQUENCE_PARALLEL=${USE_SEQUENCE_PARALLEL:-0} # 1 | 0 +# +# +# export MODEL_SIZE_KEY="GPT1_5B" +# export SEQ_LEN=1024 +# export USE_FLASH_ATTN=1 +# export MICRO_BATCH=4 +# export WORLD_SIZE=8 +# export SP_TYPE="ds" +# export SPSIZE=8 +# export PPSIZE=1 +# export MPSIZE=1 +# export ZERO_STAGE=3 +# export USE_SEQUENCE_PARALLEL=0 + + +# getValFromFile "${DIR}/model.sh" MODEL_SIZE +# getValFromFile "${DIR}/args.sh" PPSIZE +# getValFromFile "${DIR}/args.sh" MPSIZE +# getValFromFile "${DIR}/args.sh" MICRO_BATCH +# getValFromFile "${DIR}/args.sh" GRADIENT_ACCUMULATION_STEPS +# +# MODEL_SIZE="${MODEL_SIZE}" +# PPSIZE="${PPSIZE}" +# MPSIZE="${MPSIZE}" +# MICRO_BATCH="${MICRO_BATCH}" +# GRADIENT_ACCUMULATION_STEPS="${GRADIENT_ACCUMULATION_STEPS}" + +QUEUE=$1 +NUM_NODES=$2 +DURATION=$3 +PROJECT=$4 + +# MODEL_SIZE_KEY=$5 +# SEQ_LEN=$6 +# USE_FLASH_ATTN=$7 +# MICRO_BATCH=$8 +# GAS=$9 +# SP_TYPE=$10 + +# MODEL_SIZE_KEY="GPT6_7B" SEQ_LEN=2048 USE_FLASH_ATTN=0 MICRO_BATCH=1 GAS=1 SP_TYPE="deepspeed" ./ALCF/submit-pbs.sh debug-scaling 4 00:30:00 datascience + +# export MICRO_BATCH=${MICRO_BATCH:-1} +# export MICRO_BATCH="${MICRO_BATCH}" +# export MODEL_SIZE="${MODEL_SIZE}" +# # export GAS="${GRADIENT_ACCUMULATION_STEPS}" +# export GRADIENT_ACCUMULATION_STEPS=${GAS:-1} +# +# export DDP_IMPL="local" # FSDP | local | torch +# # export USE_FLASH_ATTN=${USE_FLASH_ATTN:-0} # 1 | 0 +# # export USE_ACTIVATION_CHECKPOINTING=1 # 1 | 0 +# export SEQ_LEN=${SEQ_LEN:-1024} +# # export MPSIZE=${MPSIZE:-1} +# export PPSIZE=${PPSIZE:-1} +# export SPSIZE=${SPSIZE:-1} +# export MICRO_BATCH=${MICRO_BATCH:-1} +# export ZERO_STAGE=${ZERO_STAGE:-1} # 0 | 1 | 2 | 3 +# # export NHOSTS="$NHOSTS" +# export GRADIENT_ACCUMULATION_STEPS=${GAS:-1} +# export USE_SEQUENCE_PARALLEL=${USE_SEQUENCE_PARALLEL:-0} # 1 | 0 +# + +if [ -z "${MODEL_SIZE_KEY}" ]; then + echo "ERROR: MODEL_SIZE_KEY not set" + exit 1 +fi + +if [ -z "${SEQ_LEN}" ]; then + echo "ERROR: SEQ_LEN not set" + echo 
"Using default SEQ_LEN=2048" + echo "Set SEQ_LEN=XXXX to change" + SEQ_LEN=2048 +fi + +if [ -z "${USE_FLASH_ATTN}" ]; then + echo "ERROR: USE_FLASH_ATTN not set" + echo "Not using flash attn! Set USE_FLASH_ATTN=1 to use" + USE_FLASH_ATTN=0 +fi + +if [ -z "${MICRO_BATCH}" ]; then + echo "ERROR: MICRO_BATCH not set" + echo "Using MICRO_BATCH=1" + MICRO_BATCH=1 +fi + +if [ -z "${GAS}" ]; then + echo "ERROR: GAS not set" + echo "Using GAS=1" + GAS=1 +fi + +if [ -z "${SP_TYPE}" ]; then + echo "ERROR: SP_TYPE not set" + echo "Using SP_TYPE=megatron" + SP_TYPE="megatron" +fi + +export GAS="${GAS}" +export SEQ_LEN="${SEQ_LEN}" +export SP_TYPE="${SP_TYPE}" +export MICRO_BATCH="${MICRO_BATCH}" +export MODEL_SIZE_KEY="${MODEL_SIZE_KEY}" +export USE_FLASH_ATTN="${USE_FLASH_ATTN}" + +echo "-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-" +echo "| MODEL_SIZE_KEY: ${MODEL_SIZE_KEY}" +echo "| SEQ_LEN: ${SEQ_LEN}" +echo "| USE_FLASH_ATTN: ${USE_FLASH_ATTN}" +echo "| MICRO_BATCH: ${MICRO_BATCH}" +echo "| GAS: ${GAS}" +echo "| SP_TYPE: ${SP_TYPE}" +echo "-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-" + +export QUEUE="${QUEUE}" +export DURATION="${DURATION}" +export TSTAMP="${TSTAMP}" +export NUM_NODES="${NUM_NODES}" +export PROJECT="${PROJECT}" + +RUN_NAME="N${NUM_NODES}-${TSTAMP}" +# RUN_NAME="mb${MICRO_BATCH}-gas${GAS}-${RUN_NAME}" +# RUN_NAME="GPT3-${MODEL_SIZE}-${RUN_NAME}" +RUN_NAME="${MODEL_SIZE_KEY}-${SP_TYPE}-mb${MICRO_BATCH}-gas${GAS}-seqlen${SEQ_LEN}-${RUN_NAME}" +export RUN_NAME="${RUN_NAME}" + +echo "QUEUE=$QUEUE" +echo "PROJECT=$PROJECT" +echo "DURATION=$DURATION" +echo "TSTAMP=$TSTAMP" +echo "NUM_NODES=$NUM_NODES" +echo "RUN_NAME: ${RUN_NAME}" +# echo "MODEL_SIZE=$MODEL_SIZE" +# echo "GAS=$GRADIENT_ACCUMULATION_STEPS" + +# QSUB_ARGS=( +# "-q ${QUEUE}" +# "-A ${PROJECT}" +# "-N ${RUN_NAME}" +# "-l select=${NUM_NODES}" +# "-l walltime=${DURATION}" +# "-l filesystems=eagle:home:grand" +# "${DIR}/submit.sh" +# ) + +OUTPUT=$(qsub \ + -q "${QUEUE}" \ + -A "${PROJECT}" \ + -N "${RUN_NAME}" \ + -l select="${NUM_NODES}" \ + -l walltime="${DURATION}" \ + -l filesystems=eagle:home:grand \ + "${ALCF_DIR}/submit.sh") + +# OUTPUT=$(qsub "${QSUB_ARGS[@]}") + +PBS_JOBID=$(echo "${OUTPUT}" | cut --delimiter="." --fields=1) +export PBS_JOBID="${PBS_JOBID}" +# echo "${TSTAMP} ${PBS_JOBID} " + +PBS_JOBSTR=( + "PBS_JOBID=${PBS_JOBID}" + "QUEUE=$QUEUE" + "PROJECT=$PROJECT" + "DURATION=$DURATION" + "TSTAMP=$TSTAMP" + "NUM_NODES=$NUM_NODES" + # "MODEL_SIZE=$MODEL_SIZE" + "RUN_NAME: ${RUN_NAME}" +) + # "GAS=$GRADIENT_ACCUMULATION_STEPS" + +TODAY=$(echo "${TSTAMP}" | cut --delimiter="-" --fields=1,2,3) +OUTFILE="${PARENT}/pbslogs/${TODAY}/${PBS_JOBID}.txt" + +if [[ ! 
-d $(dirname "${OUTFILE}") ]]; then + mkdir -p "$(dirname "${OUTFILE}")" +fi + +echo "Writing PBS_JOBSTR to ${OUTFILE}" +echo "${PBS_JOBSTR[@]}" >> "${OUTFILE}" +# echo "${PBS_JOBSTR[@]}" | tee -a "${OUTFILE}" + +echo "┌───────────────────────────────────────────┐" +echo "│ To view job output, run: \`pbstail ${PBS_JOBID}\` │" +echo "└───────────────────────────────────────────┘" diff --git a/ALCF/pre-AuroraGPT/submit.sh b/ALCF/pre-AuroraGPT/submit.sh new file mode 100755 index 0000000000..6842eb66b6 --- /dev/null +++ b/ALCF/pre-AuroraGPT/submit.sh @@ -0,0 +1,66 @@ +#!/bin/bash --login +#PBS -V +# +cd "${PBS_O_WORKDIR}" || exit + +TSTAMP=$(date "+%Y-%m-%d-%H%M%S") +export TSTAMP="$TSTAMP" + +ALCF_DIR="$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1)))/ALCF" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + +# SOURCE=${BASH_SOURCE[0]} +# while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# SOURCE=$(readlink "$SOURCE") +# [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +# done +# DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ Make sure we're not already running; if so, exit here ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +PIDS=$(ps aux | grep pretrain_gpt_alcf.py | grep -v grep | awk '{print $2}') +if [ -n "${PIDS}" ]; then + echo "Already running! Exiting!" + exit 1 +fi + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ source ./launch.sh ┃ +#┃ which then sources ./{args.sh,setup.sh} ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +# SCRIPT_DIR="/lus/grand/projects/datascience/foremans/locations/polaris/projects/saforem2/Megatron-DS-Benchmarking/ALCF/" +MODEL_FILE="${ALCF_DIR}/model.sh" +ARGS_FILE="${ALCF_DIR}/args.sh" +LAUNCH_FILE="${ALCF_DIR}/launch.sh" +SETUP_FILE="${ALCF_DIR}/setup.sh" + +sourceFile "${SETUP_FILE}" +sourceFile "${ARGS_FILE}" +sourceFile "${MODEL_FILE}" +sourceFile "${LAUNCH_FILE}" +# if [[ -f "${LAUNCH_FILE}" ]]; then +# echo "source-ing ${LAUNCH_FILE}" +# # shellcheck source=./launch.sh +# source "${LAUNCH_FILE}" +# else +# echo "ERROR: UNABLE TO SOURCE ${LAUNCH_FILE}" +# fi + +setup +elasticDistributed "$@" +wait $! 
diff --git a/ALCF/pre-AuroraGPT/train-gpt3.sh b/ALCF/pre-AuroraGPT/train-gpt3.sh new file mode 100755 index 0000000000..79e2661d16 --- /dev/null +++ b/ALCF/pre-AuroraGPT/train-gpt3.sh @@ -0,0 +1,72 @@ +#!/bin/bash --login + +TSTAMP=$(date "+%Y-%m-%d-%H%M%S") + +# HERE=$(python3 -c 'import os; print(os.getcwd())') +# ALCF_DIR="${HERE}/ALCF" +# +function WhereAmI() { + python3 -c 'import os; print(os.getcwd())' +} + +HERE=$(WhereAmI) +# ALCF_DIR=$(find "${HERE}" -name "ALCF") +ALCF_DIR="${HERE}/ALCF" + + +# ALCF_DIR="$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1)))/ALCF" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + +# SOURCE=${BASH_SOURCE[0]} +# while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# SOURCE=$(readlink "$SOURCE") +# [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +# done +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ Make sure we're not already running; if so, exit here ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +PIDS=$(ps aux | grep -E "$USER.+mpi.+pretrain_gpt_alcf.py" | grep -v grep | awk '{print $2}') +if [ -n "${PIDS}" ]; then + echo "Already running! Exiting!" + exit 1 +fi + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ source ./launch.sh ┃ +#┃ which then sources ./{args.sh,setup.sh} ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +LAUNCH_FILE="${ALCF_DIR}/launch.sh" + +sourceFile "${ALCF_DIR}/setup.sh" +sourceFile "${ALCF_DIR}/model.sh" +sourceFile "${ALCF_DIR}/args.sh" +sourceFile "${LAUNCH_FILE}" + +setup +# singleGPU "$@" 2>&1 & +# fullNode "$@" 2>&1 & +TORCH_VERSION=$(python3 -c 'import torch; print(torch.__version__)') +export TORCH_VERSION=$TORCH_VERSION +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# elasticDistributed "$@" 2>&1 & +# elasticDistributed "$@" +# PID=$! 
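+# NOTE (sketch): elasticDistributed is launched in the background below;
+# if the calling shell (e.g. a batch job) must stay alive until training
+# finishes, capture and wait on the PID, as launch.sh does:
+#
+#   elasticDistributed "$@" 2>&1 &
+#   PID=$!
+#   wait "${PID}"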
+# wait $PID +elasticDistributed "$@" 2>&1 & diff --git a/ALCF/pre-AuroraGPT/train-llama.sh b/ALCF/pre-AuroraGPT/train-llama.sh new file mode 100755 index 0000000000..6483b55c54 --- /dev/null +++ b/ALCF/pre-AuroraGPT/train-llama.sh @@ -0,0 +1,74 @@ +#!/bin/bash --login + +TSTAMP=$(date "+%Y-%m-%d-%H%M%S") + +# HERE=$(python3 -c 'import os; print(os.getcwd())') +# ALCF_DIR="${HERE}/ALCF" +# +function WhereAmI() { + python3 -c 'import os; print(os.getcwd())' +} + +HERE=$(WhereAmI) +# ALCF_DIR=$(find "${HERE}" -name "ALCF") +ALCF_DIR="${HERE}/ALCF" + + +# ALCF_DIR="$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1)))/ALCF" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + +# SOURCE=${BASH_SOURCE[0]} +# while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# SOURCE=$(readlink "$SOURCE") +# [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +# done +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ Make sure we're not already running; if so, exit here ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +PIDS=$(ps aux | egrep "$USER.+mpi.+pretrain.+.py" | grep -v grep | awk '{print $2}') +if [ -n "${PIDS}" ]; then + echo "Already running! Exiting!" + exit 1 +fi + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ source ./launch.sh ┃ +#┃ which then sources ./{args.sh,setup.sh} ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +LAUNCH_FILE="${ALCF_DIR}/launch.sh" + +sourceFile "${ALCF_DIR}/setup.sh" +# sourceFile "${ALCF_DIR}/model.sh" +# sourceFile "${ALCF_DIR}/args.sh" +sourceFile "${ALCF_DIR}/llama2_vars.sh" +sourceFile "${LAUNCH_FILE}" + +setup +# singleGPU "$@" 2>&1 & +# fullNode "$@" 2>&1 & +TORCH_VERSION=$(python3 -c 'import torch; print(torch.__version__)') +export TORCH_VERSION=$TORCH_VERSION +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# elasticDistributed "$@" 2>&1 & +# elasticDistributed "$@" +# PID=$! 
+# wait $PID +elasticDistributed "$@" 2>&1 & + diff --git a/ALCF/requirements/requirements.txt b/ALCF/requirements/requirements.txt new file mode 100644 index 0000000000..03541ba514 --- /dev/null +++ b/ALCF/requirements/requirements.txt @@ -0,0 +1,18 @@ +hjson +ninja +psutil +py-cpuinfo +pydantic +tqdm +transformers +bitsandbytes +sentencepiece +einops +xgboost +fixedint +pybind11 +six +numpy<2 +schedulefree +packaging>=20.0 +wandb diff --git a/ALCF/sunspot-env-2024-04-15-002.sh b/ALCF/sunspot-env-2024-04-15-002.sh new file mode 100644 index 0000000000..3b7155675d --- /dev/null +++ b/ALCF/sunspot-env-2024-04-15-002.sh @@ -0,0 +1,4 @@ +#!/bin/bash --login + +module use /soft/preview-modulefiles/24.086.0 +module load frameworks/2024.04.15.002.lua diff --git a/ALCF/sunspot-env.sh b/ALCF/sunspot-env.sh new file mode 100644 index 0000000000..8b02542b20 --- /dev/null +++ b/ALCF/sunspot-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash --login +# +module use /home/ftartagl/graphics-compute-runtime/modulefiles +module load graphics-compute-runtime/agama-ci-devel-803.29 +module load spack-pe-gcc/0.6.1-23.275.2 +module load gcc/12.2.0 +module use /soft/preview-modulefiles/24.086.0 +module load oneapi/release/2024.04.15.001 diff --git a/ALCF/test_alcf.sh b/ALCF/test_alcf.sh new file mode 100644 index 0000000000..853addc59d --- /dev/null +++ b/ALCF/test_alcf.sh @@ -0,0 +1,166 @@ +#!/bin/bash --login +# +# Run complete test of +# https://github.com/argonne-lcf/Megatron-DeepSpeed +# on {Polaris, Sunspot, Sirius} @ ALCF +# to launch (inside an interactive `qsub -I` job) on Polaris: +# +# ```bash` +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed/ALCF +# $ bash test_alcf.sh +# ```` + +# EXIT ON ERROR(s) +set -euxo pipefail + +NOW="$(date "+%Y-%m-%d-%H%M%S")" + +setup_conda_sunspot() { + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$(~/miniconda3/bin/conda shell hook -s posix)" + conda activate q4-drop + else + echo "Found existing python at: $(which python3)" + fi +} + +setup_conda_sirius() { + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then + export MAMBA_ROOT_PREFIX=/lus/tegu/projects/PolarisAT/foremans/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook --shell ${shell_name})" + micromamba activate 2024-04-23 + else + echo "Found existing python at: $(which python3)" + fi +} + +setup_conda_polaris() { + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then + # export CUDA_HOME=/soft/compilers/cudatoolkit/cuda-12.2.2 + # && export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba && eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" ; mm activate 2024-04-25 + export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" + micromamba activate 2024-04-25 + else + echo "Found existing python at: $(which python3)" + fi +} + + +function setEnv() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top 
of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + # setup_conda + # ---- [SunSpot] ------- || ---- [Aurora] -------------- + if [[ $(hostname) == x1* || $(hostname) == x4* ]]; then + source "${WORKING_DIR}/ALCF/sunspot-env.sh" || exit + # ----- [Aurora] ----------------------------------- + if [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + if [[ $(hostname) == x4* ]]; then + eval "$(conda shell.zsh hook)" && conda activate anl_release_q4v2 + # ----- [SunSpot] ---------------------------------- + elif [[ $(hostname) == x1* ]]; then + echo "Running on SunSpot !!" + setup_conda_sunspot + # eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate q4-drop + fi + fi + # ----- [Polaris] --------------------------------------- + elif [[ $(hostname) == x3* ]]; then + if [[ "${PBS_O_HOST}" == sirius* ]]; then + echo "Running on Sirius !!" + setup_conda_sirius + else + echo "Running on Polaris !!" + # ---- [load conda] --------------------- + setup_conda_polaris + # if [[ -d "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221" ]]; then + # source "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221/bin/activate" + # fi + fi + elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then + echo "Running on Perlmutter !!" + module load pytorch + source "${SLURM_SUBMIT_DIR}/venvs/perlmutter/pytorch-2.1.0-cu12/bin/activate" + else # ------------------------------------- [Unknown] ------------------- + echo "Unknown hostname $(hostname)" + exit 1 + fi + else + echo "Unable to setup python environment. Exiting" + exit 1 + fi + echo "[python] Using: $(which python3)" +} + + + +######################################## +# Make sure ./tmp/Megatron-DeepSpeed +# does not already exist +######################################## +setup_megatron_deepspeed() { + OUTDIR="OUTPUTS/test-polaris-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" + echo "Running test in: ${OUTDIR}" + echo "WORKING DIRECTORY: $(realpath $(pwd .))" + if [[ -d "Megatron-DeepSpeed" ]]; then + echo "Found existing Megatron-DeepSpeed in ${OUTDIR}" + echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test." + exit + fi + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + if [[ -n "${GIT_BRANCH-}" ]]; then + git checkout "${GIT_BRANCH}" + fi +} + + +main() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + setup_conda + else + echo "Unable to setup python. Exiting" + exit 1 + fi + setup_megatron_deepspeed + export DEBUG=1 + export PBS_O_WORKDIR="$(pwd)" + SUBMITTED_FROM=$(echo $PBS_O_HOST | tr '-' ' ' | awk '{print $1}') + export DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/${SUBMITTED_FROM}/books.txt" + if [[ ! -f "${DATA_FILE_LIST}" ]]; then + echo "Unable to find / use ${DATA_FILE_LIST}. Exiting." 
+ exit 1 + fi + # export ZERO_STAGE=1 + # export NUM_LAYERS=10 + # export MICRO_BATCH=8 + export TRAIN_ITER=20 + export TIMING_LOG_LEVEL=1 + bash train_llama_alcf.sh |& tee "test-${SUBMITTED_FROM}-${NOW}".log +} + +main + diff --git a/ALCF/test_blend.sh b/ALCF/test_blend.sh new file mode 100755 index 0000000000..9073d2a58c --- /dev/null +++ b/ALCF/test_blend.sh @@ -0,0 +1,73 @@ +#!/bin/bash +#PBS -l walltime=0:30:00 +#PBS -A datascience +#PBS -q debug +#PBS -l select=1 +#PBS -l filesystems=eagle:grand:home +cd ${PBS_O_WORKDIR} +export PPN=4 +export MD=/home/hzheng/ALCF-Megatron-DeepSpeed +module load conda/2023-10-04 +#conda activate /soft/datascience/megatron-deepspeed/2023-10-04 +conda activate $HOME/PolarisAT/pyenvs/megatron/2023-10-04 +export TP=1 +export PP=1 +export SP=128 +export MBS=1 +export BS=$((MBS*SP)) +export export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") +export DATA_FILE_LIST="/eagle/datasets//dolma/chunks-merge/data_file_list_chunk_1_of_4.txt" + +HIDDEN_SIZE=4096 +NUM_LAYERS=32 +SEQ_LENGTH=2048 +EMBEDDINGS=2048 +TRAIN_ITERS=10 +ZERO_STAGE=2 +MODEL=LLAMA_7B +OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} +#MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --cpu-bind depth -d 16 --hostfile $PBS_NODEFILE +python3 ./test_blendable_dataset.py \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ + --ffn-hidden-size 5504 \ + --num-attention-heads 32 \ + --micro-batch-size ${MBS} \ + --global-batch-size ${BS} \ + --seq-length ${SEQ_LENGTH} \ + --max-position-embeddings ${EMBEDDINGS} \ + --train-iters 80797 \ + --save ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --load ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --tokenizer-type Llama2Tokenizer \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 3e-4 \ + --lr-decay-style cosine \ + --min-lr 3e-5 \ + --weight-decay 0.1 \ + --clip-grad 1 \ + --lr-warmup-iters 2 \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --log-interval 1 \ + --cpu-optimizer \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 --fp16 \ + --no-query-key-layer-scaling \ + --attention-dropout 0 \ + --hidden-dropout 0 \ + --use-rotary-position-embeddings \ + --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ + --untie-embeddings-and-output-weights \ + --swiglu --normalization layernorm --disable-bias-linear --num-key-value-heads 4 \ + --tensorboard-dir ./outputs/${OUTPUT_PREFIX}/tensorboard --log-timers-to-tensorboard --tensorboard-log-interval 1 \ + --data-file-list ${DATA_FILE_LIST} \ + --data-path ${DATA_PATH} \ + --data-cache-path /tmp/hzheng-megatron-deepspeed-cache/ \ + --vocab-file ${MD}/dataset/gpt2-vocab.json --merge-file ${MD}/dataset/gpt2-merges.txt \ + --zero-stage=${ZERO_STAGE} --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed diff --git a/ALCF/test_blend_full.sh b/ALCF/test_blend_full.sh new file mode 100755 index 0000000000..459652a2ee --- /dev/null +++ b/ALCF/test_blend_full.sh @@ -0,0 +1,73 @@ +#!/bin/bash +#PBS -l walltime=0:30:00 +#PBS -A datascience +#PBS -q debug +#PBS -l select=1 +#PBS -l filesystems=eagle:grand:home +cd ${PBS_O_WORKDIR} +export PPN=4 +export MD=/eagle/argonne_tpc/soft/Megatron-DeepSpeed +source /eagle/argonne_tpc/soft/conda.sh +export TRITON_CACHE_DIR=/tmp/.cache/ + +export TP=1 +export PP=1 +export SP=128 +export MBS=1 +export BS=$((MBS*SP)) +export DATE_TAG=$(date 
+"%Y-%m-%d-%H-%M-%S") +export DATA_FILE_LIST="/eagle/datasets//dolma/data_file_list_reweighted.txt" + +HIDDEN_SIZE=4096 +NUM_LAYERS=32 +SEQ_LENGTH=2048 +EMBEDDINGS=2048 +TRAIN_ITERS=80797 +ZERO_STAGE=2 +MODEL=LLAMA_7B +export PBS_JOBSIZE=$(cat $PBS_NODEFILE | uniq | wc -l) +OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} +APRUN_PMI=pmix aprun -n $((PBS_JOBSIZE*PPN)) -N $PPN --cc depth -d 16 ${MD}/local_rank.sh python3 ALCF/test_blendable_dataset.py \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ + --ffn-hidden-size 5504 \ + --num-attention-heads 32 \ + --micro-batch-size ${MBS} \ + --global-batch-size ${BS} \ + --seq-length ${SEQ_LENGTH} \ + --max-position-embeddings ${EMBEDDINGS} \ + --train-iters ${TRAIN_ITERS} \ + --save ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --load ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --tokenizer-type Llama2Tokenizer \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 3e-4 \ + --lr-decay-style cosine \ + --min-lr 3e-5 \ + --weight-decay 0.1 \ + --clip-grad 1 \ + --lr-warmup-iters 2 \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --log-interval 1 \ + --cpu-optimizer \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 --fp16 \ + --no-query-key-layer-scaling \ + --attention-dropout 0 \ + --hidden-dropout 0 \ + --use-rotary-position-embeddings \ + --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ + --untie-embeddings-and-output-weights \ + --swiglu --normalization layernorm --disable-bias-linear --num-key-value-heads 4 \ + --tensorboard-dir ./outputs/${OUTPUT_PREFIX}/tensorboard --log-timers-to-tensorboard --tensorboard-log-interval 1 \ + --data-file-list ${DATA_FILE_LIST} \ + --data-path ${DATA_PATH} \ + --data-cache-path /tmp/hzheng-megatron-deepspeed-cache/ \ + --vocab-file ${MD}/dataset/gpt2-vocab.json --merge-file ${MD}/dataset/gpt2-merges.txt \ + --zero-stage=${ZERO_STAGE} --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed diff --git a/ALCF/test_blendable_dataset.py b/ALCF/test_blendable_dataset.py new file mode 100644 index 0000000000..c119862142 --- /dev/null +++ b/ALCF/test_blendable_dataset.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python +import time +import json +start_time = time.time() +from mpi4py import MPI +import os +from megatron.data.gpt_dataset import build_train_valid_test_datasets +import numpy as np +from megatron.global_vars import set_args, set_global_variables, get_args +from megatron.arguments import parse_args +from megatron.initialize import initialize_megatron +from megatron.data.data_samplers import build_pretraining_data_loader + +import torch +from megatron.core import mpu + + +comm = MPI.COMM_WORLD +from megatron.utils import PerfTrace, Profile + + +import datetime +def print_rank_0(msg): + if comm.rank==0: + print(f" [INFO][{datetime.datetime.now()}] {msg}", flush=True) +end_time = time.time() +print_rank_0(f"Loaded python modules in {end_time - start_time} seconds") +initialize_megatron(allow_no_cuda=True) +comm.Barrier() +print_rank_0(f"Barrier synchonization time: {time.time() - end_time} seconds") +args = get_args() +if os.getenv('DLIO_PROFILER_DATASET_DIR') is not None: + extra_trace_path = os.environ['DLIO_PROFILER_DATASET_DIR'] +else: + extra_trace_path='' +PerfTrace.initialize_log(f"{args.trace_dir}/trace-{comm.rank}-of-{comm.size}.pfw", 
f"{args.data_cache_path}:{extra_trace_path}:{args.data_path}:{args.save}:{args.load}", process_id=comm.rank) +dlp = Profile("TEST_BLENDABLEDATASET") + +os.makedirs(args.trace_dir, exist_ok=True) + +corpus_all = [] +data_file_list = args.data_file_list +print_rank_0(f"Reading data from {args.data_file_list}") +files = [] +weights = [] +flist = [] +with open(data_file_list, 'r') as fin: + for f in fin.readlines(): + w, fname, c = f.split() + weights.append(float(w)) + flist.append(fname) + files.append(float(w)) + files.append(fname) + files.append(c) + if c not in corpus_all: + corpus_all.append(c) + +splits_string="100,0,0" + +weights = np.array(weights) +weights = weights/np.sum(weights) + +num_samples = args.global_batch_size*args.train_iters +num_datasets = len(weights) +print_rank_0(f"Number of datasets: {num_datasets}") +print_rank_0(f"Global batch size: {args.global_batch_size}") +print_rank_0(f"Training iterations: {args.train_iters}") +train_valid_test_num_samples = [num_samples, 0, 0] +seed=args.seed +data_impl = args.data_impl +skip_warmup = not args.mmap_warmup +seq_length = args.seq_length +splits_string = "1,0,0" + +# Build datasets +start_build_dataset = time.time() + +print_rank_0(f"Starting to build the blendable dataset") +train_ds, valid_ds, test_ds = build_train_valid_test_datasets(files, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, data_cache_path=args.data_cache_path) + + +end_build_dataset = time.time() +print_rank_0(f"Finished building the blendable dataset in {end_build_dataset - start_build_dataset} second") +print_rank_0(f"Total number of samples: {len(train_ds)}") +print_rank_0(f"Weights set: {weights[:min(8, num_datasets)]}") + + +def get_sample_info(blendable_dataset, idx): + # corpus dataset + cd = blendable_dataset.dataset_index[idx] + # index within the corpus dataset + cds = blendable_dataset.dataset_sample_index[idx] + # dataset index within each corpus + fcd = blendable_dataset.datasets[cd].dataset_index[cds] + # sample index within the dataset + fcds = blendable_dataset.datasets[cd].dataset_sample_index[cds] + # corresponding data file + prefix = blendable_dataset.datasets[cd].dataset_builders[fcd].prefix + corpus = blendable_dataset.datasets[cd].dataset_builders[fcd].corpus + #v = blendable_dataset[idx]['text'] + #norm = np.linalg.norm(v) + return prefix, corpus, fcds + +num_batches = args.train_iters +print(f"global_batch_size: {args.global_batch_size}") +print(f"number of batches: {num_batches}") + +fout = open("samples_list.jsonl", "w") +if comm.rank == 0: + for i in range(num_batches): + ns_corpus = {} + for c in corpus_all: + ns_corpus[c] = 0 + for j in range(args.global_batch_size): + prefix, corpus, idx = get_sample_info(train_ds, i*args.global_batch_size+j) + ns_corpus[corpus] +=1 + fout.write(f"\u007b 'batch': {i}, 'sample': {j}, 'corpus': '{corpus}', 'prefix': '{prefix}', 'dataset_sample_index': {idx} \u007d\n") + fout.write(f"\u007b 'batch': {i}, 'histogram': {ns_corpus} \u007d \n") +comm.Barrier() +exit() +start_build_dataloader = time.time() +print_rank_0(f"Starting to build the data loader") +rank_in_parallel_group = mpu.get_sequence_parallel_rank() +train_dataloader = build_pretraining_data_loader( + train_ds, args.consumed_train_samples) +valid_dataloader = build_pretraining_data_loader( + valid_ds, args.consumed_valid_samples) +test_dataloader = build_pretraining_data_loader(test_ds, 0) +end_build_dataloader = time.time() +print_rank_0(f"Finished building the data loader in 
{end_build_dataloader - start_build_dataloader} second") + +print_rank_0(f"Starting loading the data") +start_loading_time = time.time() +NUM_ITEMS=1 +SLEEP_TIME=10.0 +@dlp.log +def compute(ct): + time.sleep(ct) +n=0 +start_time = time.time() +for i in dlp.iter(train_dataloader): + print(f"[{comm.rank}] DATA {i}") + n+=1 + if (n%NUM_ITEMS==0): + print_rank_0(f"Proccessed {n}th-batch in {time.time() - start_time}") + if n>=1000: + break + start_time = time.time() +end_loading_time = time.time() +print_rank_0(f"Finished loading the data ({n} batches) in {end_loading_time - start_loading_time}") diff --git a/ALCF/test_polaris.sh b/ALCF/test_polaris.sh new file mode 100644 index 0000000000..a18c87fad7 --- /dev/null +++ b/ALCF/test_polaris.sh @@ -0,0 +1,88 @@ +#!/bin/bash --login +# +# Run complete test of +# https://github.com/argonne-lcf/Megatron-DeepSpeed +# on Polaris @ ALCF +# to launch (inside an interactive `qsub -I` job) on Polaris: +# +# ```bash` +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed/ALCF +# $ bash test_polaris.sh +# ```` + +# EXIT ON ERROR(s) +set -euxo pipefail + +NOW="$(date "+%Y-%m-%d-%H%M%S")" + +######################################################## +# Setup / activate conda environment, +# mine is called q4-drop +######################################################## +setup_conda() { + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then + export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" + micromamba activate 2024-04-25 + else + echo "Found existing python at: $(which python3)" + fi +} + + +######################################## +# Make sure ./tmp/Megatron-DeepSpeed +# does not already exist +######################################## +setup_megatron_deepspeed() { + OUTDIR="OUTPUTS/test-polaris-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" + echo "Running test in: ${OUTDIR}" + echo "WORKING DIRECTORY: $(realpath $(pwd .))" + if [[ -d "Megatron-DeepSpeed" ]]; then + echo "Found existing Megatron-DeepSpeed in ${OUTDIR}" + echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test." + exit + fi + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + if [[ -n "${GIT_BRANCH-}" ]]; then + git checkout "${GIT_BRANCH}" + fi +} + + +main() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + setup_conda + else + echo "Unable to setup python. Exiting" + exit 1 + fi + setup_megatron_deepspeed + export DEBUG=1 + export PBS_O_WORKDIR="$(pwd)" + export DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/polaris/books.txt" + if [[ ! -f "${DATA_FILE_LIST}" ]]; then + echo "Unable to find / use ${DATA_FILE_LIST}. Exiting." 
+ exit 1 + fi + export ZERO_STAGE=1 + export NUM_LAYERS=10 + export MICRO_BATCH=8 + export TRAIN_ITER=20 + export TIMING_LOG_LEVEL=1 + bash train_llama_alcf.sh |& tee "test-polaris-${NOW}".log +} + +main diff --git a/ALCF/test_sirius.sh b/ALCF/test_sirius.sh new file mode 100755 index 0000000000..0a528a9519 --- /dev/null +++ b/ALCF/test_sirius.sh @@ -0,0 +1,88 @@ +#!/bin/bash --login +# +# Run complete test of +# https://github.com/argonne-lcf/Megatron-DeepSpeed +# on Sirius @ ALCF +# to launch (inside an interactive `qsub -I` job) on Sirius: +# +# ```bash` +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed/ALCF +# $ bash test_sirius.sh +# ```` + +# EXIT ON ERROR(s) +set -euxo pipefail + +NOW="$(date "+%Y-%m-%d-%H%M%S")" + +######################################################## +# Setup / activate conda environment, +# mine is called q4-drop +######################################################## +setup_conda() { + if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then + export MAMBA_ROOT_PREFIX=/lus/tegu/projects/PolarisAT/foremans/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook --shell ${shell_name})" + micromamba activate 2024-04-23 + else + echo "Found existing python at: $(which python3)" + fi +} + + +######################################## +# Make sure ./tmp/Megatron-DeepSpeed +# does not already exist +######################################## +setup_megatron_deepspeed() { + OUTDIR="OUTPUTS/test-sirius-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" + echo "Running test in: ${OUTDIR}" + echo "WORKING DIRECTORY: $(realpath $(pwd .))" + if [[ -d "Megatron-DeepSpeed" ]]; then + echo "Found existing Megatron-DeepSpeed in ${OUTDIR}" + echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test." + exit + fi + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + if [[ -n "${GIT_BRANCH-}" ]]; then + git checkout "${GIT_BRANCH}" + fi +} + + +main() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + setup_conda + else + echo "Unable to setup python. Exiting" + exit 1 + fi + setup_megatron_deepspeed + export DEBUG=1 + export PBS_O_WORKDIR="$(pwd)" + export DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/sirius/books.txt" + if [[ ! -f "${DATA_FILE_LIST}" ]]; then + echo "Unable to find / use ${DATA_FILE_LIST}. Exiting." 
+        exit 1
+    fi
+    export ZERO_STAGE=1
+    export NUM_LAYERS=10
+    export MICRO_BATCH=8
+    export TRAIN_ITER=20
+    export TIMING_LOG_LEVEL=1
+    bash train_llama_alcf.sh |& tee "test-sirius-${NOW}".log
+}
+
+main
diff --git a/ALCF/test_sunspot.sh b/ALCF/test_sunspot.sh
new file mode 100755
index 0000000000..b3b22c78b4
--- /dev/null
+++ b/ALCF/test_sunspot.sh
@@ -0,0 +1,87 @@
+#!/bin/bash --login
+#
+# Run complete test of
+# https://github.com/argonne-lcf/Megatron-DeepSpeed
+# on Sunspot @ ALCF
+# to launch (inside an interactive `qsub -I` job) on Sunspot:
+#
+# ```bash
+# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed
+# $ cd Megatron-DeepSpeed/ALCF
+# $ bash test_sunspot.sh
+# ```
+
+# EXIT ON ERROR(s)
+set -euxo pipefail
+
+NOW="$(date "+%Y-%m-%d-%H%M%S")"
+
+########################################################
+# Setup / activate conda environment,
+# mine is called q4-drop
+########################################################
+setup_conda() {
+    if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then
+        shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}')
+        eval "$(~/miniconda3/bin/conda shell hook -s posix)"
+        conda activate q4-drop
+    else
+        echo "Found existing python at: $(which python3)"
+    fi
+}
+
+
+########################################
+# Make sure ./tmp/Megatron-DeepSpeed
+# does not already exist
+########################################
+setup_megatron_deepspeed() {
+    OUTDIR="OUTPUTS/test-sunspot-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}"
+    echo "Running test in: ${OUTDIR}"
+    echo "WORKING DIRECTORY: $(realpath $(pwd .))"
+    if [[ -d "Megatron-DeepSpeed" ]]; then
+        echo "Found existing Megatron-DeepSpeed in ${OUTDIR}"
+        echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test."
+        exit
+    fi
+    git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed
+    if [[ -n "${GIT_BRANCH-}" ]]; then
+        git checkout "${GIT_BRANCH}"
+    fi
+}
+
+
+main() {
+    local virtual_env="${VIRTUAL_ENV-}"
+    local conda_prefix="${CONDA_PREFIX-}"
+    if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then
+        echo "Using conda from: ${conda_prefix}"
+    elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then
+        echo "Using virtual_env from: ${virtual_env}"
+    elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then
+        echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}"
+    elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then
+        echo "No conda_prefix or virtual_env found in environment..."
+        echo "Setting up conda"
+        setup_conda
+    else
+        echo "Unable to setup python. Exiting"
+        exit 1
+    fi
+    setup_megatron_deepspeed
+    export DEBUG=1
+    export PBS_O_WORKDIR="$(pwd)"
+    export DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/sunspot/books.txt"
+    if [[ ! -f "${DATA_FILE_LIST}" ]]; then
+        echo "Unable to find / use ${DATA_FILE_LIST}. Exiting."
+        exit 1
+    fi
+    export ZERO_STAGE=1
+    export NUM_LAYERS=10
+    export MICRO_BATCH=8
+    export TRAIN_ITER=20
+    export TIMING_LOG_LEVEL=1
+    bash train_llama_alcf.sh |& tee "test-sunspot-${NOW}.log"
+}
+
+main
diff --git a/ALCF/tokenizer.model b/ALCF/tokenizer.model
new file mode 100644
index 0000000000..22bccbcb41
Binary files /dev/null and b/ALCF/tokenizer.model differ
diff --git a/mds_to_hf.py b/mds_to_hf.py
new file mode 100644
index 0000000000..d91513ed8b
--- /dev/null
+++ b/mds_to_hf.py
@@ -0,0 +1,106 @@
+# Usage : python mds_to_hf.py --mds_checkpoint --output_dir --cache_dir /flare/Aurora_deployment/vsastry
+# Tips : Do not run on login node.
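+# Example (illustrative paths; DeepSpeed model-state checkpoints follow the mp_rank_*_model_states.pt naming):
+#   python3 mds_to_hf.py --mds_checkpoint checkpoints/global_step1000/mp_rank_00_model_states.pt \
+#       --output_dir converted-hf/ --cache_dir /tmp/hf-cache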
+# This script currently only takes care of tp=1. It takes an AuroraGPT Llama model trained with Megatron-DeepSpeed and converts it to the LlamaForCausalLM architecture from Hugging Face.
+
+import argparse
+import torch
+import os
+from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
+
+def repeat_kv_wt(x,np):
+    return torch.repeat_interleave(x, dim=0, repeats=np)
+
+def Update_llama_config(Llama_config, mds_args):
+    if mds_args['swiglu']:
+        Llama_config.hidden_act = "silu"
+    Llama_config.hidden_size = mds_args['hidden_size']
+    Llama_config.intermediate_size = mds_args['ffn_hidden_size']
+    Llama_config.max_position_embeddings = mds_args['max_position_embeddings']
+    Llama_config.num_attention_heads = mds_args['num_attention_heads']
+    Llama_config.num_hidden_layers = mds_args['num_layers']
+    Llama_config.num_key_value_heads = mds_args['num_key_value_heads']
+    Llama_config.rms_norm_eps = mds_args['layernorm_epsilon']
+    Llama_config.rope_theta = mds_args['rope_theta']
+    Llama_config.vocab_size = mds_args['padded_vocab_size']
+    if mds_args['fp16'] == True:
+        Llama_config.torch_dtype = 'float16'
+    elif mds_args['bf16'] == True:
+        Llama_config.torch_dtype = 'bfloat16'
+    return Llama_config
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--mds_checkpoint', required=True)
+    parser.add_argument('--output_dir', required=True)
+    parser.add_argument('--cache_dir', required=True)
+    args = parser.parse_args()
+
+    # make output_dir if it does not exist.
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    filename = str(args.mds_checkpoint)
+    if not filename.split("/")[-1].startswith('mp_rank') or not filename.split("/")[-1].endswith('.pt'):
+        raise ValueError("Provide the right file path. The file should be of the format mp_rank_*.pt")
+    print(f"loading mds checkpoint {filename}")
+
+    mds_model = torch.load(args.mds_checkpoint,map_location=torch.device('cpu'))
+    Llama_model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf",cache_dir=args.cache_dir)
+
+    Llama_config = Llama_model.config
+    Updated_Llama_config = Update_llama_config(Llama_config, mds_model['args'].__dict__)
+    # save the updated config.json file
+    Updated_Llama_config.to_json_file(os.path.join(args.output_dir,'config.json'))
+
+    state_dict = {}
+    dim = mds_model['args'].__dict__['kv_channels']
+    inv_freq = 1.0 / (mds_model['args'].__dict__['rope_theta'] ** (torch.arange(0,dim, 2).float() / dim))
+    hidden_size = mds_model['args'].__dict__['hidden_size']
+    kv_dim = mds_model['args'].__dict__['kv_channels'] * mds_model['args'].__dict__['num_key_value_heads']
+    kv_groups = mds_model['args'].__dict__['num_attention_heads'] // mds_model['args'].__dict__['num_key_value_heads']
+    nkvheads = mds_model['args'].__dict__['num_key_value_heads']
+    for layer_i in range(Updated_Llama_config.__dict__['num_hidden_layers']):
+        # SELF ATTENTION layers.
+        # get the q, k, v weights separately. Keeping k and v at the GQA head dim, since transformers/models/llama/modeling_llama.py will take care of it.
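+        # Layout of the fused Megatron QKV weight under GQA: for each of the `nkvheads`
+        # key/value groups, the rows pack `kv_groups` query heads of size `dim`, then one
+        # key head and one value head, all over `hidden_size` columns. The view/slices
+        # below undo that packing to recover separate q/k/v projection matrices.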
+        fused_qkv = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.self_attention.query_key_value.weight"]
+        fused_reshape = fused_qkv.view(nkvheads,(kv_groups+2)*dim,hidden_size)
+        ex_q = fused_reshape[:,:kv_groups*dim,:]
+        con_q = ex_q.contiguous().view(-1, fused_reshape.size(2))
+
+        ex_k = fused_reshape[:,kv_groups*dim:(kv_groups+1)*dim,:]
+        con_k = ex_k.contiguous().view(-1, fused_reshape.size(2))
+
+        ex_v = fused_reshape[:,(kv_groups+1)*dim:(kv_groups+2)*dim,:]
+        con_v = ex_v.contiguous().view(-1, fused_reshape.size(2))
+
+        state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = con_q
+        state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = con_k
+        #state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = repeat_kv_wt(fused_qkv[hidden_size:hidden_size+kv_dim], kv_groups)
+        state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = con_v
+        #state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = repeat_kv_wt(fused_qkv[hidden_size+kv_dim:hidden_size+2*kv_dim],kv_groups)
+        state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.self_attention.dense.weight"]
+
+        # MLP Layers
+        fused_mlp = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.mlp.dense_h_to_4h.weight"]
+        chunked_mlp = torch.chunk(fused_mlp,2,dim=0)
+        state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = chunked_mlp[0]
+        state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = chunked_mlp[1]
+        state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.mlp.dense_4h_to_h.weight"]
+
+        #LayerNorm weights and RoPE
+        state_dict[f"model.layers.{layer_i}.input_layernorm.weight"] = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.input_layernorm.weight"]
+        state_dict[f"model.layers.{layer_i}.post_attention_layernorm.weight"] = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.post_attention_layernorm.weight"]
+
+        state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
+
+    # Get the non-encoder layer weights.
+    state_dict["model.embed_tokens.weight"] = mds_model['module']['language_model']['embedding']['word_embeddings']['weight']
+    state_dict["model.norm.weight"] = mds_model['module']['language_model']['encoder']['final_layernorm.weight']
+    state_dict["lm_head.weight"] = mds_model['module']['language_model']['output_layer']['weight']
+
+    # Save the model in the hf output path.
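+    # (Sketch) Once this script finishes, output_dir contains config.json and
+    # pytorch_model.bin, so the converted weights can be loaded with, e.g.,
+    #   model = LlamaForCausalLM.from_pretrained(args.output_dir)
+    # Note that tokenizer files are not written here; copy them in separately.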
+ torch.save(state_dict, os.path.join(args.output_dir,"pytorch_model.bin")) + + + diff --git a/megatron/arguments.py b/megatron/arguments.py index 6580dba80a..9b0e6ccb1a 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -780,6 +780,15 @@ def _add_regularization_args(parser): help='Weight decay increment function.') group.add_argument('--clip-grad', type=float, default=1.0, help='Gradient clipping based on global L2 norm.') + group.add_argument('--sophiag-beta1', type=float, default=0.9, + help='First coefficient for computing running averages ' + 'of gradient and its hessian') + group.add_argument('--sophiag-beta2', type=float, default=0.95, + help='Second coefficient for computing running averages ' + 'of gradient and its hessian') + group.add_argument('--sophiag-rho', type=float, default=0.01, + help='SophiaG clipping threshhold') + group.add_argument('--adam-beta1', type=float, default=0.9, help='First coefficient for computing running averages ' 'of gradient and its square') @@ -818,7 +827,7 @@ def _add_training_args(parser): ' ' ' ' 'For example:' - ' --rampup-batch-size 16 8 300000 \ ' + ' --rampup-batch-size 16 8 300000 \\ ' ' --global-batch-size 1024' 'will start with global batch size 16 and over ' ' (1024 - 16) / 8 = 126 intervals will increase' @@ -896,6 +905,8 @@ def _add_training_args(parser): 'training if SIGTERM is received') group.add_argument('--tensorboard-dir', type=str, default=None, help='Write TensorBoard logs to this directory.') + group.add_argument('--trace-dir', type=str, default="./trace/", + help='Write trace logs to this directory.') group.add_argument('--no-masked-softmax-fusion', action='store_false', help='Disable fusion of query_key_value scaling, ' @@ -937,9 +948,38 @@ def _add_training_args(parser): group.add_argument('--disable-bias-linear', action='store_false', help='Disable bias in the linear layers', dest='add_bias_linear') - group.add_argument('--optimizer', type=str, default='adam', - choices=['adam', 'sgd'], - help='Optimizer function') + group.add_argument( + '--optimizer', + type=str, + default='adam', + choices=[ + 'adam', + 'adamw', + 'sophiag', + 'sgd', + 'ds.fusedlamb', + 'ipex.lamb', + 'ipex.fusedlamb', + 'apex.adam', + 'apex.sgd', + 'adamwschedulefree', + 'sgdschedulefree', + 'galoreadamw', + 'adam8bit', + 'galoreadamw8bit', + 'galoreadamw8bitperlayer' + ], + help='Optimizer function' + ) + group.add_argument( + "--schedulefree-for-each", + action="store_true", + help=""" + Use a foreach-backed implementation of the schedulefree optimizers. + Should be significantly faster, + but will have a higher peak memory usage. + """, + ) group.add_argument('--dataloader-type', type=str, default=None, choices=['single', 'cyclic'], help='Single pass vs multiple pass data loader') @@ -982,6 +1022,19 @@ def _add_training_args(parser): dest='gradient_accumulation_fusion') group.add_argument('--use-dataset-only', type=bool, required=False, default=False, help='If set to True, only use the megatron dataset for external trainer ') + # group.add_argument('--profile', action='store_true', help='Enable Torch Profiler') + group.add_argument( + "--train-range-to-skip", + action="extend", + nargs="+", + type=int, + help=("Range of iters to skip during training. Must be in pairs."), + ) + group.add_argument('--train-iters-to-skip', action="extend", nargs="+", type=str, + help=( + "Specific train iterations to skip when training. " + "Load the data and just perform a noop." 
+ )) return parser @@ -1235,6 +1288,13 @@ def _add_data_args(parser): 'single dataset used for all three: train, valid ' 'and test. It is exclusive to the other ' '--*-data-path args') + group.add_argument('--data-file-list', type=str, default=None, + help='The file with the list of dataset and weights') + + group.add_argument('--shuffle-sample-in-corpus', action='store_true', help="Whether to shuffle the samples within in the dataset files") + + group.add_argument('--blend-sample-in-corpus', action='store_true', help="Whether to blend different files in the same corpus") + group.add_argument('--split', type=str, default='969, 30, 1', help='Comma-separated list of proportions for training,' ' validation, and test split. For example the split ' @@ -1296,7 +1356,8 @@ def _add_data_args(parser): 'SentencePieceTokenizer', 'GPTSentencePieceTokenizer', 'HFTokenizer', - 'NullTokenizer'], + 'NullTokenizer', + 'Llama2Tokenizer'], help='What type of tokenizer to use.') group.add_argument('--tokenizer-model', type=str, default=None, help='Sentencepiece tokenizer model.') @@ -1331,6 +1392,7 @@ def _add_data_args(parser): help='Force to use certain index file.') group.add_argument('--repeated-dataloader', action='store_true', help='Once all the data has been loaded, reuse the DataLoader.') + group.add_argument('--multiprocessing-context', type=str, default='fork') return parser @@ -1487,6 +1549,8 @@ def _add_zero_args(parser): help='Remote device for ZeRO-3 initialized parameters.') group.add_argument('--use-pin-memory', action='store_true', help='Use pinned CPU memory for ZeRO-3 initialized model parameters.') + group.add_argument('--use-mics', action='store_true', + help='Use MiCS') return parser def _add_memoryopt_args(parser): @@ -1531,7 +1595,6 @@ def _add_activation_checkpoint_args(parser): def _add_distillation_args(parser): group = parser.add_argument_group('Knowledge distillation', 'Distillation Configurations') - group.add_argument('--num-layers-teacher', type=int, default=None, help='Number of the teacher transformer layers.') group.add_argument('--num-experts-teacher', type=int, nargs='+', default=[1,], @@ -1540,7 +1603,6 @@ def _add_distillation_args(parser): help='Tansformer teacher hidden size.') group.add_argument('--num-attention-heads-teacher', type=int, default=None, help='Number of teacher transformer attention heads.') - group.add_argument('--mos', action='store_true', help='Enable Mixture-of-Students via knolwedge distillation.') group.add_argument('--kd', action='store_true', @@ -1550,7 +1612,6 @@ def _add_distillation_args(parser): group.add_argument('--kd-temp', default=1.0, type=float) group.add_argument('--reset-iteration', action='store_true', help='Reset the iteration count.') - group.add_argument('--load-teacher', type=str, default=None, help='Directory containing a teacher model checkpoint.') diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 3e4d20035e..a4f82ec9d3 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -8,6 +8,12 @@ import numpy as np from deepspeed.accelerator import get_accelerator import torch +import ezpz as ez +import logging +import torch.distributed as tdist + +import yaml +from pathlib import Path from megatron import update_num_microbatches, get_tokenizer from megatron.core import mpu, tensor_parallel @@ -15,6 +21,7 @@ from .utils import (unwrap_model, print_rank_0, is_rank_0) +from .utils import PerfTrace, Profile from deepspeed.checkpoint import ( ORIGINAL_VOCAB_SIZE, @@ -24,9 +31,15 @@ 
UNIVERSAL_CHECKPOINT_VERSION_VALUE, ) -_CHECKPOINT_VERSION = None +RANK = ez.get_rank() +WORLD_SIZE = ez.get_world_size() +DEVICE = ez.get_torch_device() +log = logging.getLogger(__name__) +log.setLevel("INFO") if RANK == 0 else log.setLevel("CRITICAL") +_CHECKPOINT_VERSION = None +dlp = Profile("CHECKPOINT") def set_checkpoint_version(value): global _CHECKPOINT_VERSION if _CHECKPOINT_VERSION is not None: @@ -155,7 +168,7 @@ def get_checkpoint_tracker_filename(checkpoints_path): training to restart from.""" return os.path.join(checkpoints_path, 'latest_checkpointed_iteration.txt') - +@dlp.log def read_metadata(tracker_filename): # Read the tracker file and either set the iteration or # mark it as a release checkpoint. @@ -195,7 +208,7 @@ def read_metadata(tracker_filename): max_iter = iteration return max_iter, release - +@dlp.log def get_rng_state(): """ collect rng state across data parallel ranks """ args = get_args() @@ -221,10 +234,16 @@ def get_rng_state(): return rng_state_list - +@dlp.log def save_checkpoint(iteration, model, optimizer, opt_param_scheduler): """Save a model checkpoint.""" args = get_args() + assert args is not None + args_iter = args.iteration + if args_iter != iteration: + log.warning(f"{args.iteration=} != {iteration} passed to 'save_checkpoint'") + + save_lr_state_dict() # Only rank zero of the data parallel writes to the disk. if not args.deepspeed: @@ -322,7 +341,7 @@ def state_dict_for_save_checkpoint_deepspeed(destination=None, prefix='', keep_v if torch.distributed.is_initialized(): torch.distributed.barrier() - +@dlp.log def _transpose_first_dim(t, num_splits, num_splits_first, model): input_shape = t.size() # We use a self_attention module but the values extracted aren't @@ -392,7 +411,7 @@ def fix_query_key_value_ordering(model, checkpoint_version): print_rank_0(" succesfully fixed query-key-values ordering for" " checkpoint version {}".format(checkpoint_version)) - +@dlp.log def _load_base_checkpoint(load_dir, rank0=False): """ Load the base state_dict from the given directory @@ -447,7 +466,7 @@ def _load_base_checkpoint(load_dir, rank0=False): return state_dict, release - +@dlp.log def load_args_from_checkpoint(args, load_arg='load'): """Set required arguments from the checkpoint specified in the arguments. @@ -528,16 +547,82 @@ def _set_arg(arg_name, old_arg_name=None, force=False): _set_arg('num_layers_per_virtual_pipeline_stage') return args, checkpoint_args - -def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', strict=True, load_only_weights=False): +@dlp.log +def load_lr_state_dict(strict: bool = False) -> dict: + """Load {iteration, lr} from .yaml file when restoring from checkpoint.""" + args = get_args() + assert args is not None + lr_state_dict_fp = Path(args.load).joinpath( + f"lr_state_dict_{RANK}_of_{WORLD_SIZE}.yaml" + ) + lr_state_dict = {} + if lr_state_dict_fp.is_file(): + with lr_state_dict_fp.open('r') as f: + lr_state_dict = yaml.safe_load(f) + args.lr = lr_state_dict['lr'] + else: + if strict: + raise FileNotFoundError( + f"{lr_state_dict_fp=}.is_file() is False" + ) + log.info( + f"Unable to load lr_state_dict from {lr_state_dict_fp=}, " + f"but strict=False. Returning empty dictionary: {lr_state_dict=}" + ) + return lr_state_dict + +@dlp.log +def save_lr_state_dict() -> None: + """Save {iteration, lr} to .yaml file for safe-keeping. + + Make sure we're only saving from RANK == 0. 
+ """ + if RANK != 0: + return None + args = get_args() + assert args is not None + outdir = getattr(args, 'save', None) + assert outdir is not None + lr_state_dict_fp = Path(args.save).joinpath( + "lr_state_dict.yaml" + ) + log.info(f"Saving lr_state_dict to {lr_state_dict_fp.as_posix()}") + with lr_state_dict_fp.open('w') as f: + yaml.dump( + {'iteration': args.iteration, 'lr': args.lr}, + f + ) + +@dlp.log +def load_checkpoint( + model, + optimizer, + opt_param_scheduler, + load_arg: str = 'load', + strict: bool = True, + load_only_weights: bool = False, + strict_lr_state_dict: bool = False +): """Load a model checkpoint and return the iteration. strict (bool): whether to strictly enforce that the keys in :attr:`state_dict` of the checkpoint match the names of parameters and buffers in model. """ args = get_args() + assert args is not None load_dir = getattr(args, load_arg) - + lr_state_dict = {} + lr_tensor = torch.tensor(args.lr, requires_grad=False, device=DEVICE) + if RANK == 0: + lr_state_dict = load_lr_state_dict(strict=strict_lr_state_dict) + if len(lr_state_dict.keys()) > 0 and 'lr' in lr_state_dict: + lr_tensor = torch.tensor( + lr_state_dict['lr'], + requires_grad=False, + device=DEVICE, + ) + tdist.broadcast(lr_tensor, 0) + args.lr = lr_tensor.item() if args.deepspeed: if args.finetune: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, @@ -553,7 +638,7 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri print_rank_0(' will not load any checkpoints and will start from ' 'random') return 0 - release = False + release = False else: model = unwrap_model(model) @@ -729,7 +814,7 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri return iteration - +@dlp.log def load_biencoder_checkpoint(model, only_query_model=False, only_context_model=False, custom_load_path=None): """ diff --git a/megatron/core/pipeline_parallel/p2p_communication.py b/megatron/core/pipeline_parallel/p2p_communication.py index b23f6c84b3..78e43e7fed 100644 --- a/megatron/core/pipeline_parallel/p2p_communication.py +++ b/megatron/core/pipeline_parallel/p2p_communication.py @@ -16,7 +16,8 @@ from megatron.core import ModelParallelConfig from deepspeed.accelerator import get_accelerator - +from megatron.utils import Profile +dlp = Profile("PIPELINE") # Types Shape = Union[List[int], torch.Size] @@ -329,6 +330,7 @@ def _ring_exchange_wrapper(**kwargs): return tensor_recv_prev, tensor_recv_next, reqs +@dlp.log def recv_forward(tensor_shape: Shape, config: ModelParallelConfig) -> torch.Tensor: """ Receive tensor from previous rank in pipeline (forward receive). @@ -353,7 +355,7 @@ def recv_forward(tensor_shape: Shape, config.timers('forward-recv').stop() return input_tensor - +@dlp.log def recv_backward(tensor_shape: Shape, config: ModelParallelConfig) -> torch.Tensor: """Receive tensor from next rank in pipeline (backward receive). @@ -376,7 +378,7 @@ def recv_backward(tensor_shape: Shape, config.timers('backward-recv').stop() return output_tensor_grad - +@dlp.log def send_forward(output_tensor: torch.Tensor, config: ModelParallelConfig) -> None: """Send tensor to next rank in pipeline (forward send). @@ -397,7 +399,7 @@ def send_forward(output_tensor: torch.Tensor, if config.timers is not None: config.timers('forward-send').stop() - +@dlp.log def send_backward(input_tensor_grad: torch.Tensor, config: ModelParallelConfig) -> None: """Send tensor to previous rank in pipeline (backward send). 
@@ -417,7 +419,7 @@ def send_backward(input_tensor_grad: torch.Tensor, if config.timers is not None: config.timers('backward-send').stop() - +@dlp.log def send_forward_recv_backward(output_tensor: torch.Tensor, tensor_shape: Shape, config: ModelParallelConfig) -> torch.Tensor: @@ -441,7 +443,7 @@ def send_forward_recv_backward(output_tensor: torch.Tensor, config.timers('forward-send-backward-recv').stop() return output_tensor_grad - +@dlp.log def send_backward_recv_forward(input_tensor_grad: torch.Tensor, tensor_shape: Shape, config: ModelParallelConfig) -> torch.Tensor: @@ -465,7 +467,7 @@ def send_backward_recv_forward(input_tensor_grad: torch.Tensor, config.timers('backward-send-forward-recv').stop() return input_tensor - +@dlp.log def send_forward_recv_forward(output_tensor: torch.Tensor, recv_prev: bool, tensor_shape: Shape, @@ -491,7 +493,7 @@ def send_forward_recv_forward(output_tensor: torch.Tensor, return input_tensor, wait_handles return input_tensor - +@dlp.log def send_backward_recv_backward(input_tensor_grad: torch.Tensor, recv_next: bool, tensor_shape: Shape, @@ -517,7 +519,7 @@ def send_backward_recv_backward(input_tensor_grad: torch.Tensor, return output_tensor_grad, wait_handles return output_tensor_grad - +@dlp.log def send_forward_backward_recv_forward_backward( output_tensor: torch.Tensor, input_tensor_grad: torch.Tensor, diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index 407bb16d56..c24959c64f 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -14,10 +14,12 @@ from megatron.core.enums import ModelType from megatron.core.utils import get_attr_wrapped_model, get_model_type, get_model_config -from megatron.utils import unwrap_model +from megatron.utils import print_rank_0, unwrap_model from megatron.model import DistributedDataParallel as LocalDDP from megatron.model import Float16Module +from megatron.utils import Profile +dlp = Profile("CORE") # Types Shape = Union[List[int], torch.Size] @@ -124,6 +126,7 @@ def deallocate_output_tensor(out, deallocate_pipeline_outputs=False): dtype = out.dtype, ) +@dlp.log def custom_backward(output, grad_output): '''Directly call C++ autograd engine. @@ -162,7 +165,7 @@ def custom_backward(output, grad_output): - +@dlp.log def forward_step(forward_step_func, data_iterator, model, @@ -227,8 +230,15 @@ def forward_step(forward_step_func, return output_tensor return [output_tensor] - -def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config, model=None): +@dlp.log +def backward_step( + input_tensor, + output_tensor, + output_tensor_grad, + model_type, + config, + model=None +): """Backward step through passed-in output tensor. If last stage, output_tensor_grad is None, otherwise gradient of loss @@ -241,12 +251,19 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c # needs to be modified slightly to support arbitrary numbers of skip # connections. args = get_args() - if args.deepspeed: - assert model is not None - + assert args is not None if config.timers is not None: config.timers('backward-compute', log_level=2).start() - + if (to_skip := getattr(args, 'train_iters_to_skip', None)) is not None: + if config.timers is not None: + config.timers('backward-compute').stop() + if len(to_skip) > 0 and args.iteration in [int(i) for i in to_skip]: + print_rank_0( + f'Caught {args.iteration=} in `iters_to_skip`! Skipping!' 
+ ) + return [None] + if args.deepspeed: + assert model is not None # Retain the grad on the input_tensor. unwrap_input_tensor_grad = False if not isinstance(input_tensor, list): @@ -255,24 +272,20 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c for x in input_tensor: if x is not None: x.retain_grad() - if not isinstance(output_tensor, list): output_tensor = [output_tensor] if not isinstance(output_tensor_grad, list): output_tensor_grad = [output_tensor_grad] - # Backward pass. if args.deepspeed: model.backward(output_tensor[0]) else: if output_tensor_grad[0] is None and config.grad_scale_func is not None: output_tensor[0] = config.grad_scale_func(output_tensor[0]) - if config.deallocate_pipeline_outputs: custom_backward(output_tensor[0], output_tensor_grad[0]) else: torch.autograd.backward(output_tensor[0], grad_tensors=output_tensor_grad[0]) - # Collect the grad of the input_tensor. input_tensor_grad = [None] if input_tensor is not None: @@ -282,7 +295,6 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c input_tensor_grad.append(None) else: input_tensor_grad.append(x.grad) - # Handle single skip connection if it exists (encoder_hidden_state in # model with encoder and decoder). if parallel_state.get_pipeline_model_parallel_world_size() > 1 and \ @@ -292,13 +304,11 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c input_tensor_grad[-1].add_(output_tensor_grad[1]) if unwrap_input_tensor_grad: input_tensor_grad = input_tensor_grad[0] - if config.timers is not None: config.timers('backward-compute').stop() - return input_tensor_grad - +@dlp.log def forward_backward_no_pipelining(*, forward_step_func, data_iterator: Union[Iterator, List[Iterator]], @@ -345,7 +355,7 @@ def forward_backward_no_pipelining(*, forward_data_store = [] input_tensor, output_tensor_grad = None, None with no_sync_func(): - for i in range(num_microbatches - 1): + for i in dlp.iter(range(num_microbatches - 1)): output_tensor = forward_step(forward_step_func, data_iterator, model, num_microbatches, input_tensor, forward_data_store, config, collect_non_loss_data) if not forward_only: @@ -363,7 +373,7 @@ def forward_backward_no_pipelining(*, return forward_data_store - +@dlp.log def forward_backward_pipelining_with_interleaving(*, forward_step_func, data_iterator: Union[Iterator, List[Iterator]], @@ -916,7 +926,7 @@ def get_tensor_shapes(*, return tensor_shapes - +@dlp.log def recv_forward(tensor_shapes, config): input_tensors = [] for tensor_shape in tensor_shapes: @@ -926,7 +936,7 @@ def recv_forward(tensor_shapes, config): input_tensors.append(p2p_communication.recv_forward(tensor_shape, config)) return input_tensors - +@dlp.log def recv_backward(tensor_shapes, config): output_tensor_grads = [] for tensor_shape in tensor_shapes: @@ -936,7 +946,7 @@ def recv_backward(tensor_shapes, config): output_tensor_grads.append(p2p_communication.recv_backward(tensor_shape, config)) return output_tensor_grads - +@dlp.log def send_forward(output_tensors, tensor_shapes, config): if not isinstance(output_tensors, list): output_tensors = [output_tensors] @@ -945,7 +955,7 @@ def send_forward(output_tensors, tensor_shapes, config): continue p2p_communication.send_forward(output_tensor, config) - +@dlp.log def send_backward(input_tensor_grads, tensor_shapes, config): if not isinstance(input_tensor_grads, list): input_tensor_grads = [input_tensor_grads] @@ -954,7 +964,7 @@ def send_backward(input_tensor_grads, tensor_shapes, config): 
continue p2p_communication.send_backward(input_tensor_grad, config) - +@dlp.log def send_forward_recv_backward(output_tensors, tensor_shapes, config): if not isinstance(output_tensors, list): output_tensors = [output_tensors] @@ -968,7 +978,7 @@ def send_forward_recv_backward(output_tensors, tensor_shapes, config): output_tensor_grads.append(output_tensor_grad) return output_tensor_grads - +@dlp.log def send_backward_recv_forward(input_tensor_grads, tensor_shapes, config): if not isinstance(input_tensor_grads, list): input_tensor_grads = [input_tensor_grads] @@ -982,7 +992,7 @@ def send_backward_recv_forward(input_tensor_grads, tensor_shapes, config): input_tensors.append(input_tensor) return input_tensors - +@dlp.log def forward_backward_pipelining_without_interleaving(*, forward_step_func, data_iterator: Union[Iterator, List[Iterator]], diff --git a/megatron/core/tensor_parallel/cross_entropy.py b/megatron/core/tensor_parallel/cross_entropy.py index 9dcdc0459f..d0453d25ea 100644 --- a/megatron/core/tensor_parallel/cross_entropy.py +++ b/megatron/core/tensor_parallel/cross_entropy.py @@ -69,14 +69,14 @@ def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): vocab_size = exp_logits.size(-1) if label_smoothing > 0: - """ + r""" We'd like to assign 1 / (K - 1) probability mass to every index that is not the ground truth. = (1 - alpha) * y_gt + alpha * mean(y_{i for i != gt}) = (1 - alpha) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i = ((K - 1) * (1 - alpha) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i = (K * (1 - alpha) - 1) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i} y_i = (1 - (alpha * K) / (K - 1)) * y_gt + ( (alpha * K) / (K - 1) ) * \sum_{i} y_i / K - From: https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/losses/smoothed_cross_entropy.py + From: """ assert 1.0 > label_smoothing > 0.0 smoothing = label_smoothing * vocab_size / (vocab_size - 1) diff --git a/megatron/data/blendable_dataset.py b/megatron/data/blendable_dataset.py old mode 100644 new mode 100755 index 590375a0f2..ab164fdc48 --- a/megatron/data/blendable_dataset.py +++ b/megatron/data/blendable_dataset.py @@ -6,46 +6,30 @@ import os import time +import logging import numpy as np import torch from deepspeed.accelerator import get_accelerator -from megatron import print_rank_0 +# from megatron import print_rank_0 from megatron.core import mpu +from megatron.utils import Profile, PerfTrace +from mpi4py import MPI +from megatron.utils import get_logger +log = get_logger(__name__, rank_zero_only=True) + +dlp = Profile("DATASET") class BlendableDataset(torch.utils.data.Dataset): + @dlp.log + def __init__(self, datasets, weights, size, *, + data_cache_path=None): - def __init__( - self, - datasets, - weights, - size, - *, - data_cache_path=None - ): self.datasets = datasets num_datasets = len(datasets) - ndsets = np.array([num_datasets], dtype=np.int64).item() - len_weights = np.array([len(weights)], dtype=np.int64).item() - print_rank_0(f'{len(datasets)=}') - print_rank_0(f'{len(weights)}') - print_rank_0(f'{ndsets=}') - print_rank_0(f'{len_weights=}') - # if int(num_datasets) != int(len(weights)): - # if f'{num_datasets}' == f'{len(weights)}': - if ndsets != len_weights: - warr = np.array(weights) - print_rank_0('\n'.join([ - f'{num_datasets=}', - f'{len(weights)=}', - f'{warr.shape=}', - f'{warr.sum()=}', - # f'{num_datasets=} != {len(warr)=}', - ])) assert num_datasets == len(weights) - # else: - # raise IndexError(f'{num_datasets=} != {len(weights)=}') + self.size = size # 
Normalize weights. @@ -55,9 +39,9 @@ def __init__( weights /= sum_weights # Build indicies. + @dlp.log def _build_indices(): - start_time = time.time() - # assert num_datasets < 255 + start_time = time.perf_counter() dataset_index = np.zeros(self.size, dtype=np.int64) dataset_sample_index = np.zeros(self.size, dtype=np.int64) @@ -65,8 +49,10 @@ def _build_indices(): helpers.build_blending_indices(dataset_index, dataset_sample_index, weights, num_datasets, self.size, torch.distributed.get_rank() == 0) - print_rank_0('> elapsed time for building blendable dataset indices: ' - '{:.2f} (sec)'.format(time.time() - start_time)) + log.info( + "> elapsed time for building blendable dataset indices: " + f"{time.perf_counter() - start_time:.2f} (sec)" + ) return dataset_index, dataset_sample_index desc = "Blendable dataset\n\n" @@ -76,7 +62,8 @@ def _build_indices(): desc += f"Weights: {weights}\n" desc += f"Size: {size}\n" self.desc = desc - + self.dataset_index = np.zeros(self.size, dtype=np.int64) + self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) if data_cache_path: desc_hash = hashlib.md5(desc.encode('utf-8')).hexdigest() desc_path = os.path.join(data_cache_path, desc_hash + ".dsc") @@ -89,47 +76,50 @@ def _build_indices(): ' dataset, building indices on rank 0 ...', flush=True) dataset_index, dataset_sample_index = _build_indices() try: + log.debug(" > saving index map files") + start_time = time.perf_counter() os.makedirs(os.path.dirname(index_path), exist_ok=True) with open(desc_path, 'wt') as fd: fd.write(desc) np.save(index_path, dataset_index, allow_pickle=True) np.save(sample_index_path, dataset_sample_index, allow_pickle=True) + log.info(f" > finished saving index map files in {time.perf_counter() - start_time} seconds") except OSError: print(f'There was an error trying to create the data cache directory ({data_cache_path})') print('or a file in it. This is set with the --data-cache-path argument. Please') print('ensure you have write access to this directory or specify one that you do have') print('write access to.') cache_success = False - - + self.dataset_index = dataset_index + self.dataset_sample_index = dataset_sample_index + ''' I don't think the following piece of code is necessary any more; I commented them out now counts = get_accelerator().LongTensor([cache_success]) torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) if counts[0].item() != ( - torch.distributed.get_world_size() // - torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()) // - torch.distributed.get_world_size(group=mpu.get_sequence_parallel_group())): - print_rank_0("Data index creation unsuccessful, exiting.") + torch.distributed.get_world_size() // + torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()) // + torch.distributed.get_world_size(group=mpu.get_sequence_parallel_group())): + log.info("Data index creation unsuccessful, exiting.") exit() - - # Load on all ranks. 
- print_rank_0(f'> loading blendable dataset index: {index_path}') - self.dataset_index = np.load( - index_path, - allow_pickle=True, - mmap_mode='r' - ) + ''' + torch.distributed.barrier(group=mpu.get_data_parallel_group()) + torch.distributed.barrier(group=mpu.get_pipeline_model_parallel_group()) + torch.distributed.barrier(group=mpu.get_data_parallel_group()) + + start_time = time.perf_counter() + log.info(f'> loading blendable dataset index: {index_path}') + self.dataset_index = np.load(index_path, allow_pickle=True, mmap_mode='r') assert self.dataset_index.size == self.size - - print_rank_0( - f'> loading blendable dataset sample index: ' - f'{sample_index_path}' - ) + log.info(f'> loading blendable dataset sample index: {sample_index_path}') self.dataset_sample_index = np.load(sample_index_path, allow_pickle=True, mmap_mode='r') assert self.dataset_sample_index.size == self.size + log.info(f'> finished loading in {time.perf_counter() - start_time} seconds') else: self.dataset_index, self.dataset_sample_index = _build_indices() + + # Check size _ = self.__getitem__(self.size - 1) try: @@ -137,16 +127,18 @@ def _build_indices(): raise RuntimeError('BlendedDataset size is improperly bounded') except IndexError: pass - print_rank_0('> size of blendable dataset: ' + log.info('> size of blendable dataset: ' '{} samples'.format(self.size)) + def __len__(self): return self.size + @dlp.log def __getitem__(self, idx): dataset_idx = self.dataset_index[idx] sample_idx = self.dataset_sample_index[idx] return { - "dataset_idx": dataset_idx, + "dataset_idx" : dataset_idx, **self.datasets[dataset_idx][sample_idx], } diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 2d7da67e15..b242101b3a 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -2,7 +2,6 @@ """Dataloaders.""" - import random import torch import numpy as np @@ -41,38 +40,55 @@ def build_pretraining_data_loader(dataset, consumed_samples): args.dataloader_type)) # Torch dataloader. - loader = torch.utils.data.DataLoader(dataset, - batch_sampler=batch_sampler, - num_workers=args.num_workers, - pin_memory=True) + loader = torch.utils.data.DataLoader( + dataset, + batch_sampler=batch_sampler, + num_workers=args.num_workers, + pin_memory=True, + multiprocessing_context=( + args.multiprocessing_context if args.num_workers > 0 + else None + ) + ) if args.repeated_dataloader: loader=RepeatingLoader(loader) return loader class MegatronPretrainingSampler: - def __init__(self, total_samples, consumed_samples, micro_batch_size, - data_parallel_rank, data_parallel_size, drop_last=True): + def __init__( + self, + total_samples, + consumed_samples, + micro_batch_size, + data_parallel_rank, + data_parallel_size, + drop_last=True + ): # Keep a copy of input params for later use. self.total_samples = total_samples self.consumed_samples = consumed_samples self.micro_batch_size = micro_batch_size self.data_parallel_rank = data_parallel_rank - self.micro_batch_times_data_parallel_size = \ + self.micro_batch_times_data_parallel_size = ( self.micro_batch_size * data_parallel_size + ) self.drop_last = drop_last # Sanity checks. 
- assert self.total_samples > 0, \ - 'no sample to consume: {}'.format(self.total_samples) - assert self.consumed_samples < self.total_samples, \ - 'no samples left to consume: {}, {}'.format(self.consumed_samples, - self.total_samples) + assert self.total_samples > 0, ( + f'no sample to consume: {self.total_samples}' + ) + assert self.consumed_samples < self.total_samples, ( + 'no samples left to consume: ' + f'{self.consumed_samples}, {self.total_samples}' + ) assert self.micro_batch_size > 0 assert data_parallel_size > 0 - assert self.data_parallel_rank < data_parallel_size, \ - 'data_parallel_rank should be smaller than data size: {}, ' \ - '{}'.format(self.data_parallel_rank, data_parallel_size) + assert self.data_parallel_rank < data_parallel_size, ( + f'data_parallel_rank should be smaller than data size: ' + f'{self.data_parallel_rank}, {data_parallel_size}' + ) def __len__(self): return self.total_samples @@ -122,8 +138,16 @@ def __getitem__(self, idx): class MegatronPretrainingRandomSampler: - def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size, - data_parallel_rank, data_parallel_size, data_sharding): + def __init__( + self, + dataset, + total_samples, + consumed_samples, + micro_batch_size, + data_parallel_rank, + data_parallel_size, + data_sharding + ): # Keep a copy of input params for later use. self.dataset = dataset self.total_samples = total_samples @@ -132,19 +156,23 @@ def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size, self.data_parallel_rank = data_parallel_rank self.data_parallel_size = data_parallel_size self.data_sharding = data_sharding - self.micro_batch_times_data_parallel_size = \ + self.micro_batch_times_data_parallel_size = ( self.micro_batch_size * data_parallel_size - self.last_batch_size = \ + ) + self.last_batch_size = ( self.total_samples % self.micro_batch_times_data_parallel_size + ) # Sanity checks. 
- assert self.total_samples > 0, \ - 'no sample to consume: {}'.format(self.total_samples) + assert self.total_samples > 0, ( + f'no sample to consume: {self.total_samples}' + ) assert self.micro_batch_size > 0 assert data_parallel_size > 0 - assert self.data_parallel_rank < data_parallel_size, \ - 'data_parallel_rank should be smaller than data size: {}, ' \ - '{}'.format(self.data_parallel_rank, data_parallel_size) + assert self.data_parallel_rank < data_parallel_size, ( + f'data_parallel_rank should be smaller than data size: ' + f'{self.data_parallel_rank}, {data_parallel_size}' + ) def __len__(self): return self.total_samples @@ -160,23 +188,31 @@ def __iter__(self): # data sharding and random sampling if self.data_sharding: - bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ - * self.micro_batch_size + bucket_size = ( + self.micro_batch_size * ( + self.total_samples + // self.micro_batch_times_data_parallel_size + ) + ) bucket_offset = current_epoch_samples // self.data_parallel_size start_idx = self.data_parallel_rank * bucket_size - g = torch.Generator() g.manual_seed(self.epoch) random_idx = torch.randperm(bucket_size, generator=g).tolist() idx_range = [start_idx + x for x in random_idx[bucket_offset:]] else: - full_bucket_size = (self.total_samples // self.micro_batch_size) \ - * self.micro_batch_size + full_bucket_size = ( + self.micro_batch_size * ( + self.total_samples + // self.micro_batch_size + ) + ) full_bucket_offset = current_epoch_samples g = torch.Generator() g.manual_seed(self.epoch) - idx_range_total = \ + idx_range_total = ( torch.randperm(full_bucket_size, generator=g).tolist() + ) idx_range_active = idx_range_total[full_bucket_offset:] idx_range = idx_range_active[self.data_parallel_rank::self.data_parallel_size] @@ -187,4 +223,4 @@ def __iter__(self): if len(batch) == self.micro_batch_size: self.consumed_samples += self.micro_batch_times_data_parallel_size yield batch - batch = [] \ No newline at end of file + batch = [] diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index cadca053cf..61dd3909e4 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -40,6 +40,18 @@ DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5] +def get_datasets_corpuses_weights_and_num_samples(data_prefix, train_valid_test_num_samples): + assert len(data_prefix) % 3 == 0 + num_datasets = len(data_prefix) // 3 + data_new_prefix = [] + corpuses = [] + for i in range(num_datasets): + data_new_prefix += [data_prefix[3*i], data_prefix[3*i+1]] + corpuses.append(data_prefix[3*i+2]) + prefixes, weights, datasets_train_valid_test_num_samples = \ + get_datasets_weights_and_num_samples(data_new_prefix, + train_valid_test_num_samples) + return prefixes, corpuses, weights, datasets_train_valid_test_num_samples def get_datasets_weights_and_num_samples(data_prefix, train_valid_test_num_samples): diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index f9cf5ee6f7..d09f08d63a 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -9,26 +9,45 @@ import numpy as np import torch from deepspeed.accelerator import get_accelerator -from megatron import print_rank_0, is_rank_0, get_args +from megatron import is_rank_0, get_args from megatron.core import mpu +from megatron.data import helpers # type:ignore from megatron.data.blendable_dataset import BlendableDataset -from megatron.data.dataset_utils import get_datasets_weights_and_num_samples +from 
megatron.data.dataset_utils import ( + get_datasets_weights_and_num_samples, + get_datasets_corpuses_weights_and_num_samples, +) from megatron.data.dataset_utils import get_train_valid_test_split_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset - -def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, - train_valid_test_num_samples, - seq_length, seed, skip_warmup, - train_data_prefix=None, - valid_data_prefix=None, - test_data_prefix=None, - return_doc_ids=False, *, - data_cache_path=None): +from megatron.utils import PerfTrace, Profile, get_logger +from mpi4py import MPI + +dlp = Profile("DATASET") + +log = get_logger(__name__, rank_zero_only=True) + + +@dlp.log +def build_train_valid_test_datasets( + data_prefix, + data_impl, + splits_string, + train_valid_test_num_samples, + seq_length, + seed, + skip_warmup, + train_data_prefix=None, + valid_data_prefix=None, + test_data_prefix=None, + return_doc_ids=False, + *, + data_cache_path=None, +): """Build train, valid, and test datasets.""" if data_prefix: - print_rank_0("Single data path provided for train, valid & test") + log.debug("Single data path provided for train, valid & test") # Single dataset. if len(data_prefix) == 1: @@ -40,72 +59,289 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, seq_length, seed, skip_warmup, - data_cache_path=data_cache_path + data_cache_path=data_cache_path, ) # Blending dataset. # Parse the values. - output = get_datasets_weights_and_num_samples( - data_prefix, - train_valid_test_num_samples + output = get_datasets_corpuses_weights_and_num_samples( + data_prefix, train_valid_test_num_samples ) - prefixes, weights, datasets_train_valid_test_num_samples = output + prefixes, corpuses, weights, datasets_train_valid_test_num_samples = output + corpus_list = sorted(set(corpuses)) train_num_samples, valid_num_samples, test_num_samples = map( - sum, - zip(*datasets_train_valid_test_num_samples) + sum, zip(*datasets_train_valid_test_num_samples) ) - # Build individual datasets. 
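+    # The helpers below defer work: each DatasetBuilder wraps a single dataset file and
+    # only indexes it when Build() is called (each rank builds a strided subset of files
+    # up front; anything else is built lazily on first __getitem__), while
+    # BuildCorpusDataset stitches the per-file datasets of one corpus back together
+    # through blended or concatenated index arrays.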
+ class DatasetBuilder: + """ + This is for building individual dataset from each dataset file + """ + + @dlp.log + def __init__( + self, + prefix, + corpus, + data_impl, + splits_string, + num_samples, + seq_length, + seed, + skip_warmup, + return_doc_ids, + data_cache_path=data_cache_path, + name="train", + ): + self.prefix = prefix + self.data_impl = data_impl + self.splits_string = splits_string + if name == "train": + self.num_samples = num_samples[0] + elif name == "valid": + self.num_samples = num_samples[1] + else: + self.num_samples = num_samples[2] + self.num_samples_train_valid_test = num_samples + self.seq_length = seq_length + self.seed = seed + self.skip_warmup = skip_warmup + self.return_doc_ids = return_doc_ids + self.data_cache_path = data_cache_path + self.dataset = None + self.name = name + self.desc = prefix + f"{self.num_samples}" + f"{seq_length}" + f"{seed}" + self.build = False + self.corpus = corpus + + @dlp.log + def Build(self): + self.dataset = _build_train_valid_test_datasets_single( + self.prefix, + self.data_impl, + self.splits_string, + self.num_samples_train_valid_test, + self.seq_length, + self.seed, + self.skip_warmup, + self.name, + self.return_doc_ids, + data_cache_path=self.data_cache_path, + ) + self.build = True + return self.dataset + + class BuildCorpusDataset(torch.utils.data.Dataset): + @dlp.log + def __init__(self, dataset_builders): + self.dataset_builders = dataset_builders + self.num_datasets = len(dataset_builders) + self.num_samples = np.sum([d.num_samples for d in dataset_builders]) + self.indices = np.zeros((self.num_samples, 2), dtype=np.uint64) + self.desc = "CorpusDataset:" + # m = 0 + num_samples_list = np.array([d.num_samples for d in dataset_builders]) + self.num_samples = np.sum(num_samples_list) + args = get_args() + + @dlp.log + def _build_indices_blended(): + start_time = time.time() + dataset_index = np.zeros(self.num_samples, dtype=np.int64) + dataset_sample_index = np.zeros(self.num_samples, dtype=np.int64) + weights = num_samples_list / self.num_samples + helpers.build_blending_indices( + dataset_index, dataset_sample_index, + weights, self.num_datasets, self.num_samples, + torch.distributed.get_rank() == 0) + log.debug(f"> elapsed time for building blendable dataset indices for corpus {self.dataset_builders[0].corpus}: " + "{:.2f} (sec)".format(time.time() - start_time)) + return dataset_index, dataset_sample_index + + + def _build_indices_concat(): + start_time = time.time() + dataset_index = np.zeros(self.num_samples, dtype=np.int64) + dataset_sample_index = np.zeros(self.num_samples, dtype=np.int64) + helpers.build_concat_indices( + dataset_index, + dataset_sample_index, + num_samples_list, + self.num_datasets, + torch.distributed.get_rank() == 0, + ) + log.debug( + "> elapsed time for building concat dataset indices: " + "{:.2f} (sec)".format(time.time() - start_time) + ) + return dataset_index, dataset_sample_index + + if args.blend_sample_in_corpus: + self.dataset_index, self.dataset_sample_index = _build_indices_blended() + else: + self.dataset_index, self.dataset_sample_index = _build_indices_concat() + + np_rng = np.random.RandomState(seed=dataset_builders[0].seed) + self.shuffle_index = np.arange(self.num_samples) + if args.shuffle_sample_in_corpus: + np_rng.shuffle(self.shuffle_index) + for i in range(self.num_datasets): + self.desc += dataset_builders[i].prefix + "," + + log.info( + f"[BuildConcatDataset] Caught {args.shuffle_sample_in_corpus=} across" + f" {self.num_samples} samples" + ) + self.desc += ( + 
f"-{self.num_samples}" + + f"-{dataset_builders[0].seq_length}" + + f"{dataset_builders[0].seed}" + ) + + def __len__(self): + return self.num_samples + + @dlp.log + def __getitem__(self, idx): + id_shuffle = self.shuffle_index[idx] + i = self.dataset_index[id_shuffle] + j = self.dataset_sample_index[id_shuffle] + if self.dataset_builders[i].build: + return self.dataset_builders[i].dataset[j] + else: + return self.dataset_builders[i].Build()[j] + + # Predetermine whether need to build the specific dataset or not. + start_time = time.time() + log.debug(" >>> Started building datasets in distributed way ... ") + + a, b, c = [int(d) for d in splits_string.split(",")] + train_datasets = [] valid_datasets = [] test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - prefixes[i], data_impl, splits_string, - datasets_train_valid_test_num_samples[i], - seq_length, seed, skip_warmup, - return_doc_ids, - data_cache_path=data_cache_path) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) + # Build individual datasets. + args = get_args() + @dlp.log + def build_corpus_datasets(dataset_type="train"): + start_time = time.time() + log.debug(f" >>> Building {dataset_type} corpus datasets ...") + datasets = [] + corpus_builders = {} + corpus_weights = {} + for c in corpus_list: + corpus_builders[c] = [] + corpus_weights[c] = 0.0 + dataset_builders = [ + DatasetBuilder( + prefixes[i], + corpuses[i], + data_impl, + splits_string, + datasets_train_valid_test_num_samples[i], + seq_length, + seed, + skip_warmup, + return_doc_ids, + data_cache_path, + dataset_type, + ) + for i in range(len(weights)) + ] + for i in range( + torch.distributed.get_rank() + // mpu.get_tensor_model_parallel_world_size(), + len(weights), + torch.distributed.get_world_size() + // mpu.get_tensor_model_parallel_world_size(), + ): + dataset_builders[i].Build() + log.debug( + f" >>> Finished building individual datasets in {time.time() - start_time} seconds" + ) + start_concating_time = time.time() + for i, d in zip(range(len(weights)), dataset_builders): + corpus_builders[d.corpus].append(d) + corpus_weights[d.corpus] += weights[i] + total = 0 + log.debug(" > number of samples for each corpus ") + corpus_weights_achieved = {} + for c in corpus_list: + datasets.append(BuildCorpusDataset(corpus_builders[c])) + total += datasets[-1].num_samples + corpus_weights_achieved[c] = ( + float(datasets[-1].num_samples) / train_num_samples + ) + log.debug( + f" {c}: {datasets[-1].num_samples} w={corpus_weights_achieved[c]} (expected: {corpus_weights[c]})" + ) + log.debug(f" > total number of samples: {total}") + log.debug( + f" >>> Finished concatenating datasets in {time.time() - start_concating_time} seconds" + ) + log.debug( + f" >>> Finished building {dataset_type} corpus datasets in {time.time() - start_time} seconds" + ) + return datasets, [corpus_weights_achieved[c] for c in corpus_list] + + train_weights = None + if a > 0: + train_datasets, train_weights = build_corpus_datasets("train") + valid_weights = None + if b > 0: + valid_datasets, valid_weights = build_corpus_datasets("valid") + test_weights = None + if c > 0: + test_datasets, test_weights = build_corpus_datasets("test") + + # This barrier is critical to make sure that all the datasets are built once + # and the metadata were written to the cache folder before other ranks touch them + log.debug( + f" >>> Rank 0 - finished building 
datasets in {time.time() - start_time} seconds" + ) + torch.distributed.barrier(group=mpu.get_data_parallel_group()) + torch.distributed.barrier(group=mpu.get_pipeline_model_parallel_group()) + torch.distributed.barrier(group=mpu.get_data_parallel_group()) + log.debug( + f" >>> Finished building datasets (all ranks) in distributed way in {time.time() - start_time} seconds" + ) + log.debug(" >>> Starting to build BlendableDataset") # Blend. + start_time = time.time() blending_train_dataset = None - if train_datasets: + if train_datasets and train_weights: blending_train_dataset = BlendableDataset( train_datasets, - weights, + train_weights, train_num_samples, - data_cache_path=data_cache_path + data_cache_path=data_cache_path, ) blending_valid_dataset = None - if valid_datasets: + if valid_datasets and valid_weights: blending_valid_dataset = BlendableDataset( valid_datasets, - weights, + valid_weights, valid_num_samples, - data_cache_path=data_cache_path + data_cache_path=data_cache_path, ) blending_test_dataset = None - if test_datasets: + if test_datasets and test_weights: blending_test_dataset = BlendableDataset( test_datasets, - weights, + test_weights, test_num_samples, - data_cache_path=data_cache_path + data_cache_path=data_cache_path, ) - - return (blending_train_dataset, blending_valid_dataset, - blending_test_dataset) + end_time = time.time() + log.debug( + f" >>> Finished building BlendableDataset in {end_time - start_time} seconds" + ) + return (blending_train_dataset, blending_valid_dataset, blending_test_dataset) else: - print_rank_0( - "Separate data paths provided for train, valid & test. " - "Split string will be ignored." + log.debug( + "Separate data paths provided for train, valid & test. Split string will be ignored." ) train_dataset, valid_dataset, test_dataset = None, None, None @@ -117,8 +353,10 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, data_impl, splits_string, train_valid_test_num_samples[0], - seq_length, seed, skip_warmup, - data_cache_path=data_cache_path + seq_length, + seed, + skip_warmup, + data_cache_path=data_cache_path, ) if valid_data_prefix is not None: @@ -131,7 +369,7 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, seq_length, seed, False, - data_cache_path=data_cache_path + data_cache_path=data_cache_path, ) if test_data_prefix is not None: @@ -144,77 +382,168 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, seq_length, seed, False, - data_cache_path=data_cache_path + data_cache_path=data_cache_path, ) return (train_dataset, valid_dataset, test_dataset) +@dlp.log def _build_train_valid_test_datasets( - data_prefix, - data_impl, - splits_string, - train_valid_test_num_samples, - seq_length, - seed, - skip_warmup, - return_doc_ids=False, - *, - data_cache_path=None + data_prefix, + data_impl, + splits_string, + train_valid_test_num_samples, + seq_length, + seed, + skip_warmup, + return_doc_ids=False, + *, + data_cache_path=None, ): """Build train, valid, and test datasets.""" # Indexed dataset. - indexed_dataset = get_indexed_dataset_(data_prefix, - data_impl, - skip_warmup) + indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup) total_num_of_documents = indexed_dataset.sizes.shape[0] splits = get_train_valid_test_split_(splits_string, total_num_of_documents) # Print stats about the splits. 
- print_rank_0(' > dataset split:') + log.debug(" > dataset split:") def print_split_stats(name, index): - print_rank_0(' {}:'.format(name)) - print_rank_0(' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[index], splits[index + 1], - splits[index + 1] - splits[index])) - print_split_stats('train', 0) - print_split_stats('validation', 1) - print_split_stats('test', 2) + log.debug(" {}:".format(name)) + log.debug( + " document indices in [{}, {}) total of {} " "documents".format( + splits[index], splits[index + 1], splits[index + 1] - splits[index] + ) + ) + + print_split_stats("train", 0) + print_split_stats("validation", 1) + print_split_stats("test", 2) def build_dataset(index, name): dataset = None if splits[index + 1] > splits[index]: - documents = np.arange(start=splits[index], stop=splits[index + 1], - step=1, dtype=np.int32) - dataset = GPTDataset(name, data_prefix, documents, indexed_dataset, - splits_string, - train_valid_test_num_samples[index], - seq_length, seed, - return_doc_ids, - data_cache_path=data_cache_path) + documents = np.arange( + start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32 + ) + dataset = GPTDataset( + name, + data_prefix, + documents, + indexed_dataset, + splits_string, + train_valid_test_num_samples[index], + seq_length, + seed, + return_doc_ids, + data_cache_path=data_cache_path, + ) return dataset - train_dataset = build_dataset(0, 'train') - valid_dataset = build_dataset(1, 'valid') - test_dataset = build_dataset(2, 'test') + train_dataset = build_dataset(0, "train") + valid_dataset = build_dataset(1, "valid") + test_dataset = build_dataset(2, "test") return (train_dataset, valid_dataset, test_dataset) -def build_dataset(dataset_name, data_prefix, data_impl, - splits_string, num_samples, - seq_length, seed, skip_warmup, - *, - data_cache_path=None): +@dlp.log +def _build_train_valid_test_datasets_single( + data_prefix, + data_impl, + splits_string, + train_valid_test_num_samples, + seq_length, + seed, + skip_warmup, + name, + return_doc_ids=False, + *, + data_cache_path=None, +): + """Build train, valid, and test datasets.""" + + # Each rank print out information + log.debug(f" >> building dataset for {data_prefix}") + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + + # Print stats about the splits. 
+ log.debug(" > dataset split:") + + def print_split_stats(name, index): + log.debug(" {}:".format(name)) + log.debug( + " document indices in [{}, {}) total of {} " "documents".format( + splits[index], splits[index + 1], splits[index + 1] - splits[index] + ) + ) + + print_split_stats("train", 0) + print_split_stats("validation", 1) + print_split_stats("test", 2) + + def build_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + documents = np.arange( + start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32 + ) + dataset = GPTDataset( + name, + data_prefix, + documents, + indexed_dataset, + splits_string, + train_valid_test_num_samples[index], + seq_length, + seed, + return_doc_ids, + data_cache_path=data_cache_path, + ) + return dataset + + if name.find("train") != -1: + return build_dataset(0, "train") + if name.find("valid") != -1: + return build_dataset(1, "valid") + if name.find("test") != -1: + return build_dataset(2, "test") + + +@dlp.log +def build_dataset( + dataset_name, + data_prefix, + data_impl, + splits_string, + num_samples, + seq_length, + seed, + skip_warmup, + *, + data_cache_path=None, +): dataset = None if len(data_prefix) == 1: - dataset = _build_dataset(dataset_name, data_prefix[0], data_impl, - splits_string, num_samples, seq_length, - seed, skip_warmup, - data_cache_path=data_cache_path) + dataset = _build_dataset( + dataset_name, + data_prefix[0], + data_impl, + splits_string, + num_samples, + seq_length, + seed, + skip_warmup, + data_cache_path=data_cache_path, + ) else: # Blending dataset. # Parse the values. @@ -225,31 +554,40 @@ def build_dataset(dataset_name, data_prefix, data_impl, # Build individual datasets. datasets = [] for i in range(len(prefixes)): - ds = _build_dataset(dataset_name, prefixes[i], data_impl, - splits_string, dataset_num_samples[i], - seq_length, seed, skip_warmup, - data_cache_path=data_cache_path) + ds = _build_dataset( + dataset_name, + prefixes[i], + data_impl, + splits_string, + dataset_num_samples[i], + seq_length, + seed, + skip_warmup, + data_cache_path=data_cache_path, + ) if ds: datasets.append(ds) if datasets: - dataset = BlendableDataset(datasets, weights, num_samples, - data_cache_path=data_cache_path) + dataset = BlendableDataset( + datasets, weights, num_samples, data_cache_path=data_cache_path + ) return dataset +@dlp.log def _build_dataset( - dataset_name, - data_prefix, - data_impl, - splits_string, - num_samples, - seq_length, - seed, - skip_warmup, - *, - data_cache_path=None + dataset_name, + data_prefix, + data_impl, + splits_string, + num_samples, + seq_length, + seed, + skip_warmup, + *, + data_cache_path=None, ): """ Build dataset. This method is called when individual @@ -257,30 +595,20 @@ def _build_dataset( """ # Indexed dataset. 
- indexed_dataset = get_indexed_dataset_(data_prefix, - data_impl, - skip_warmup) + indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup) total_num_of_documents = indexed_dataset.sizes.shape[0] - print_rank_0(f' {dataset_name}:') - print_rank_0( - f' ' - f'document indices in [0, {total_num_of_documents}) ' - f'total of {total_num_of_documents} documents' - ) - # 'documents'.format( - # total_num_of_documents, - # total_num_of_documents - # )) - - documents = np.arange( - start=0, - stop=total_num_of_documents, - step=1, - # dtype=np.int32 + log.debug(" {}:".format(dataset_name)) + log.debug( + " document indices in [0, {}) total of {} " "documents".format( + total_num_of_documents, total_num_of_documents + ) ) - return GPTDataset( + + documents = np.arange(start=0, stop=total_num_of_documents, step=1, dtype=np.int32) + + dataset = GPTDataset( dataset_name, data_prefix, documents, @@ -289,33 +617,45 @@ def _build_dataset( num_samples, seq_length, seed, - data_cache_path=data_cache_path + data_cache_path=data_cache_path, ) + return dataset + +@dlp.log def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): """Build indexed dataset.""" - print_rank_0(' > building dataset index ...') + log.debug(" > building dataset index ...") start_time = time.time() - indexed_dataset = make_indexed_dataset(data_prefix, - data_impl, - skip_warmup) - print_rank_0(' > finished creating indexed dataset in {:4f} ' - 'seconds'.format(time.time() - start_time)) - print_rank_0(' number of documents: {}'.format( - indexed_dataset.sizes.shape[0])) + indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup) + log.debug( + " > finished creating indexed dataset in {:4f} " "seconds".format( + time.time() - start_time + ) + ) + log.debug(" number of documents: {}".format(indexed_dataset.sizes.shape[0])) return indexed_dataset class GPTDataset(torch.utils.data.Dataset): - - def __init__(self, name, data_prefix, documents, indexed_dataset, - splits_string, num_samples, seq_length, seed, - return_doc_ids=False, *, - data_cache_path=None): - + @dlp.log + def __init__( + self, + name, + data_prefix, + documents, + indexed_dataset, + splits_string, + num_samples, + seq_length, + seed, + return_doc_ids=False, + *, + data_cache_path=None, + ): self.name = name self.indexed_dataset = indexed_dataset self.return_doc_ids = return_doc_ids @@ -325,20 +665,29 @@ def __init__(self, name, data_prefix, documents, indexed_dataset, assert np.max(documents) < indexed_dataset.sizes.shape[0] # Build index mappings. - self.doc_idx, self.sample_idx, self.shuffle_idx, self.desc, self.desc_hash = \ - _build_index_mappings(self.name, data_prefix, - documents, self.indexed_dataset.sizes, - splits_string, num_samples, seq_length, seed, - data_cache_path=data_cache_path) - + self.doc_idx, self.sample_idx, self.shuffle_idx, self.desc, self.desc_hash = ( + _build_index_mappings( + self.name, + data_prefix, + documents, + self.indexed_dataset.sizes, + splits_string, + num_samples, + seq_length, + seed, + data_cache_path=data_cache_path, + ) + ) def __len__(self): # -1 is due to data structure used to retieve the index: # sample i --> [sample_idx[i], sample_idx[i+1]) return self.sample_idx.shape[0] - 1 + @dlp.log def __getitem__(self, idx): args = get_args() + assert args is not None orig_idx = idx # Get the shuffled index. 
try: @@ -347,21 +696,24 @@ def __getitem__(self, idx): if is_rank_0(): import json from rich import print_json + print(exc) print( - '\n'.join( - ['-------------------------------------------------', - f'Trying to access {idx=} from self.shuffle_idx,', - f'but {len(self.shuffle_idx)=}', - '-------------------------------------------------'] + "\n".join( + [ + "-------------------------------------------------", + f"Trying to access {idx=} from self.shuffle_idx,", + f"but {len(self.shuffle_idx)=}", + "-------------------------------------------------", + ] ) ) print_json( json.dumps( { - 'doc_idx': len(self.doc_idx), - 'sample_idx': len(self.sample_idx), - 'shuffle_idx': len(self.shuffle_idx), + "doc_idx": len(self.doc_idx), + "sample_idx": len(self.sample_idx), + "shuffle_idx": len(self.shuffle_idx), }, indent=4, ) @@ -378,44 +730,54 @@ def __getitem__(self, idx): sample = self.indexed_dataset.get( self.doc_idx[doc_index_f], offset=offset_f, - length=offset_l - offset_f + 1 + length=offset_l - offset_f + 1, ) else: # Otherwise, get the rest of the initial document. doc_ids.append(self.doc_idx[doc_index_f]) - sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], - offset=offset_f)] + sample_list = [ + self.indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f) + ] # Loop over all in between documents and add the entire document. for i in range(doc_index_f + 1, doc_index_l): doc_ids.append(self.doc_idx[i]) sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) # And finally add the relevant portion of last document. doc_ids.append(self.doc_idx[doc_index_l]) - sample_list.append(self.indexed_dataset.get( - self.doc_idx[doc_index_l], - length=offset_l + 1)) + sample_list.append( + self.indexed_dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1) + ) sample = np.concatenate(sample_list) - text_name = 'text' + text_name = "text" if args.use_dataset_only: - text_name = 'input_ids' + text_name = "input_ids" sample_dict = {text_name: np.array(sample, dtype=np.int64)} if args.return_data_index: - sample_dict.update({'index': np.array([orig_idx], dtype=np.int64)}) + sample_dict.update({"index": np.array([orig_idx], dtype=np.int64)}) - if self.return_doc_ids: # for retro preprocessing - sample_dict.update({'doc_ids': np.array(doc_ids, dtype=np.int64)}) + if self.return_doc_ids: # for retro preprocessing + sample_dict.update({"doc_ids": np.array(doc_ids, dtype=np.int64)}) if args.use_dataset_only: - sample_dict.update({'labels': np.array(sample, dtype=np.int64)}) + sample_dict.update({"labels": np.array(sample, dtype=np.int64)}) return sample_dict -def _build_index_mappings(name, data_prefix, documents, sizes, - splits_string, num_samples, seq_length, seed, - *, - data_cache_path): +@dlp.log +def _build_index_mappings( + name, + data_prefix, + documents, + sizes, + splits_string, + num_samples, + seq_length, + seed, + *, + data_cache_path, +): """Build doc-idx, sample-idx, and shuffle-idx. doc-idx: is an array (ordered) of documents to be used in training. sample-idx: is the start document index and document offset for each @@ -423,10 +785,11 @@ def _build_index_mappings(name, data_prefix, documents, sizes, shuffle-idx: maps the sample index into a random index into sample-idx. """ args = get_args() + assert args is not None # Number of tokens in each epoch and number of required epochs. 
tokens_per_epoch = _num_tokens(documents, sizes) num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) - if args.train_data_exact_num_epochs is not None and name == 'train': + if args.train_data_exact_num_epochs is not None and name == "train": num_epochs = args.train_data_exact_num_epochs # rng state @@ -441,13 +804,13 @@ def _build_index_mappings(name, data_prefix, documents, sizes, desc += f"Sequence length {seq_length}\n" desc += f"Random seed {seed}\n" desc += f"Split {splits_string}\n" - desc_hash = hashlib.md5(desc.encode('utf-8')).hexdigest() + desc_hash = hashlib.md5(desc.encode("utf-8")).hexdigest() desc_filename = desc_hash + ".dsc" - doc_idx_filename = desc_hash + '_doc_idx.npy' - sample_idx_filename = desc_hash + '_sample_idx.npy' - shuffle_idx_filename = desc_hash + '_shuffle_idx.npy' + doc_idx_filename = desc_hash + "_doc_idx.npy" + sample_idx_filename = desc_hash + "_sample_idx.npy" + shuffle_idx_filename = desc_hash + "_shuffle_idx.npy" - if name == 'train': + if name == "train": # force to use certain index files if args.train_desc_path is not None: desc_filename = args.train_desc_path @@ -462,15 +825,15 @@ def _build_index_mappings(name, data_prefix, documents, sizes, # duplication, then look in data-cache-path if specified, # If nothing is found, use the last path looked in build_indices = True - prefixes = [os.path.join(os.path.dirname(data_prefix), 'index-cache')] + prefixes = [os.path.join(os.path.dirname(data_prefix), "index-cache")] if data_cache_path is not None: prefixes.append(data_cache_path) for prefix in prefixes: idx_path = { - 'desc': os.path.join(prefix, desc_filename), - 'doc': os.path.join(prefix, doc_idx_filename), - 'sample': os.path.join(prefix, sample_idx_filename), - 'shuffle': os.path.join(prefix, shuffle_idx_filename) + "desc": os.path.join(prefix, desc_filename), + "doc": os.path.join(prefix, doc_idx_filename), + "sample": os.path.join(prefix, sample_idx_filename), + "shuffle": os.path.join(prefix, shuffle_idx_filename), } for f in idx_path.values(): if not os.path.isfile(f): @@ -479,13 +842,17 @@ def _build_index_mappings(name, data_prefix, documents, sizes, # Found our files! build_indices = False break - data_cache_dir = os.path.dirname(idx_path['desc']) + data_cache_dir = os.path.dirname(idx_path["desc"]) data_cache_success = True # Build the indexed mapping if not exist. - if build_indices and is_rank_0(): - print_rank_0(' > WARNING: could not find index map files, building ' - 'the indices on rank 0 ...') + if build_indices: + # Since this function will be called by all the rank in the very beginning. Therefore, we assume that all the + # ranks will first create the document files, and then read it. + # There will not be contension effects going on either + log.warning( + f" > WARNING: could not find index map files, building on rank {torch.distributed.get_rank()}" + ) # For the last epoch, decide whether include the entire epoch # in the global shuffle or not. @@ -494,64 +861,80 @@ def _build_index_mappings(name, data_prefix, documents, sizes, # not mean anything. 
if num_epochs == 1: separate_last_epoch = False - print(' > only one epoch required, setting ' - 'separate_last_epoch to False', flush=True) + log.debug( + " > only one epoch required, setting " "separate_last_epoch to False" + ) else: # Get the number of samples for the last epoch num_samples_from_epochs_minus_one = ( - (num_epochs - 1) * tokens_per_epoch - 1) // seq_length - last_epoch_num_samples = num_samples - \ - num_samples_from_epochs_minus_one - assert last_epoch_num_samples >= 0, \ - 'last epoch number of samples should be non-negative.' + (num_epochs - 1) * tokens_per_epoch - 1 + ) // seq_length + last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one + assert ( + last_epoch_num_samples >= 0 + ), "last epoch number of samples should be non-negative." num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length - assert last_epoch_num_samples <= (num_samples_per_epoch + 1), \ - 'last epoch number of samples exceeded max value.' + assert last_epoch_num_samples <= ( + num_samples_per_epoch + 1 + ), "last epoch number of samples exceeded max value." # If we have less than 80% of the samples for the last epoch, # seperate out the epoch and treat it differently. # Note: the 80% number is just based on common sense and can # be adjusted if needed. - separate_last_epoch = (last_epoch_num_samples < - int(0.80 * num_samples_per_epoch)) + separate_last_epoch = last_epoch_num_samples < int( + 0.80 * num_samples_per_epoch + ) if separate_last_epoch: - string = ' > last epoch number of samples ({}) is smaller '\ - 'than 80% of number of samples per epoch ({}), '\ - 'setting separate_last_epoch to True' + string = ( + " > last epoch number of samples ({}) is smaller " + "than 80% of number of samples per epoch ({}), " + "setting separate_last_epoch to True" + ) else: - string = ' > last epoch number of samples ({}) is larger '\ - 'than 80% of number of samples per epoch ({}), '\ - 'setting separate_last_epoch to False' - print(string.format(last_epoch_num_samples, - num_samples_per_epoch), flush=True) - + string = ( + " > last epoch number of samples ({}) is larger " + "than 80% of number of samples per epoch ({}), " + "setting separate_last_epoch to False" + ) + log.debug(string.format(last_epoch_num_samples, num_samples_per_epoch)) try: os.makedirs(data_cache_dir, exist_ok=True) # description - with open(idx_path['desc'], 'wt') as fd: + with open(idx_path["desc"], "wt") as fd: fd.write(desc) # doc-idx. start_time = time.time() - doc_idx = _build_doc_idx(documents, num_epochs, np_rng, - separate_last_epoch) - np.save(idx_path['doc'], doc_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save doc-idx mapping ' - '(seconds): {:4f}'.format(time.time() - start_time)) + doc_idx = _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch) + np.save(idx_path["doc"], doc_idx, allow_pickle=True) + log.debug( + " > elasped time to build and save doc-idx mapping " + "(seconds): {:4f}".format(time.time() - start_time) + ) # sample-idx. start_time = time.time() # Use C++ implementation for speed. # First compile and then import. 
from megatron.data import helpers + assert doc_idx.dtype == np.int32 assert sizes.dtype == np.int32 - sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, - num_epochs, tokens_per_epoch) - np.save(idx_path['sample'], sample_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save sample-idx mapping ' - '(seconds): {:4f}'.format(time.time() - start_time)) + sample_idx = helpers.build_sample_idx( + sizes, + doc_idx, + seq_length, + num_epochs, + tokens_per_epoch, + torch.distributed.get_rank() == 0, + ) + np.save(idx_path["sample"], sample_idx, allow_pickle=True) + log.debug( + " > elasped time to build and save sample-idx mapping " + "(seconds): {:4f}".format(time.time() - start_time) + ) # shuffle-idx. start_time = time.time() # -1 is due to data structure used to retieve the index: @@ -560,45 +943,46 @@ def _build_index_mappings(name, data_prefix, documents, sizes, num_samples_ = num_samples_from_epochs_minus_one else: num_samples_ = sample_idx.shape[0] - 1 - shuffle_idx = _build_shuffle_idx(num_samples_, - sample_idx.shape[0] - 1, np_rng) - np.save(idx_path['shuffle'], shuffle_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save shuffle-idx mapping' - ' (seconds): {:4f}'.format(time.time() - start_time)) + shuffle_idx = _build_shuffle_idx( + num_samples_, sample_idx.shape[0] - 1, np_rng + ) + np.save(idx_path["shuffle"], shuffle_idx, allow_pickle=True) + log.debug( + " > elasped time to build and save shuffle-idx mapping" + " (seconds): {:4f}".format(time.time() - start_time) + ) except OSError: - print(f'There was an error trying to create the data cache directory ({data_cache_dir})') - print('or a file in it. This defaults to a directory "index-cache" within the directory') - print('the data files are in and can be set with the --data-cache-path argument. Please') - print('ensure you have write access to this directory or specify one that you do have') - print('write access to.') + print( + f"There was an error trying to create the data cache directory ({data_cache_dir})" + ) + print( + 'or a file in it. This defaults to a directory "index-cache" within the directory' + ) + print( + "the data files are in and can be set with the --data-cache-path argument. Please" + ) + print( + "ensure you have write access to this directory or specify one that you do have" + ) + print("write access to.") data_cache_success = False - counts = get_accelerator().LongTensor([data_cache_success]) - torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) - torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) - if counts[0].item() != ( - torch.distributed.get_world_size() // - torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()) // - torch.distributed.get_world_size(group=mpu.get_sequence_parallel_group())): - print_rank_0("Data index creation unsuccessful, exiting.") - exit() - # Load mappings. 
start_time = time.time() - print_rank_0(f" > loading doc-idx mapping from {idx_path['doc']}") - doc_idx = np.load(idx_path['doc'], allow_pickle=True, mmap_mode='r') + log.debug(f" > loading doc-idx mapping from {idx_path['doc']}") + doc_idx = np.load(idx_path["doc"], allow_pickle=True, mmap_mode="r") - print_rank_0(f" > loading sample-idx mapping from {idx_path['sample']}") - sample_idx = np.load(idx_path['sample'], allow_pickle=True, mmap_mode='r') + log.debug(f" > loading sample-idx mapping from {idx_path['sample']}") + sample_idx = np.load(idx_path["sample"], allow_pickle=True, mmap_mode="r") - print_rank_0(f" > loading shuffle-idx mapping from {idx_path['shuffle']}") - shuffle_idx = np.load(idx_path['shuffle'], allow_pickle=True, mmap_mode='r') + log.debug(f" > loading shuffle-idx mapping from {idx_path['shuffle']}") + shuffle_idx = np.load(idx_path["shuffle"], allow_pickle=True, mmap_mode="r") - print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( - time.time() - start_time)) - print_rank_0(' total number of samples: {}'.format( - sample_idx.shape[0])) - print_rank_0(' total number of epochs: {}'.format(num_epochs)) + log.debug( + " loaded indexed file in {:3.3f} seconds".format(time.time() - start_time) + ) + log.debug(" total number of samples: {}".format(sample_idx.shape[0])) + log.debug(" total number of epochs: {}".format(num_epochs)) return doc_idx, sample_idx, shuffle_idx, desc, desc_hash @@ -623,24 +1007,25 @@ def _num_epochs(tokens_per_epoch, seq_length, num_samples): return num_epochs +@dlp.log def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): """Build an array with length = number-of-epochs * number-of-dcuments. Each index is mapped to a corresponding document.""" if not separate_last_epoch or num_epochs == 1: - doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1] + doc_idx = np.mgrid[0:num_epochs, 0 : len(documents)][1] doc_idx[:] = documents doc_idx = doc_idx.reshape(-1) doc_idx = doc_idx.astype(np.int32) np_rng.shuffle(doc_idx) return doc_idx - doc_idx_first = _build_doc_idx(documents, num_epochs-1, np_rng, False) + doc_idx_first = _build_doc_idx(documents, num_epochs - 1, np_rng, False) doc_idx_last = _build_doc_idx(documents, 1, np_rng, False) return np.concatenate((doc_idx_first, doc_idx_last)) -def _build_sample_idx(sizes, doc_idx, seq_length, - num_epochs, tokens_per_epoch): +@dlp.log +def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch): """Sample index mapping is a 2D array with sizes [number-of-samples + 1, 2] where [..., 0] contains the index into `doc_idx` and [..., 1] is the @@ -674,7 +1059,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length, # Note that -1 here is for the same reason we have -1 in # `_num_epochs` calculations. if remaining_seq_length <= 0: - doc_offset += (remaining_seq_length + doc_length - 1) + doc_offset += remaining_seq_length + doc_length - 1 remaining_seq_length = 0 else: # Otherwise, start from the begining of the next document. 
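The corpus-aware blending path in this patch assumes `--data-path` is given as repeating triplets rather than the usual weight/path pairs; `get_datasets_corpuses_weights_and_num_samples` (added to `megatron/data/dataset_utils.py` earlier in this diff) peels off the corpus label from each triplet and delegates the rest to the existing `get_datasets_weights_and_num_samples`. The following is only a rough, self-contained sketch of that parsing step, assuming the conventional `(weight, prefix, corpus)` ordering; the helper name and the example paths/labels below are made up for illustration, and the function in the diff remains the authoritative version.

```python
def split_weight_prefix_corpus(data_prefix):
    """Hypothetical standalone restatement of the triplet parsing done by
    get_datasets_corpuses_weights_and_num_samples (illustration only)."""
    assert len(data_prefix) % 3 == 0, "expected (weight, prefix, corpus) triplets"
    weights, prefixes, corpuses = [], [], []
    for i in range(len(data_prefix) // 3):
        weights.append(float(data_prefix[3 * i]))   # blending weight
        prefixes.append(data_prefix[3 * i + 1])     # .bin/.idx basename
        corpuses.append(data_prefix[3 * i + 2])     # corpus label used for grouping
    return weights, prefixes, corpuses


# Illustrative values only; these paths and labels are not real datasets:
weights, prefixes, corpuses = split_weight_prefix_corpus([
    "0.7", "/data/books_text_document", "books",
    "0.3", "/data/code_text_document", "code",
])
print(sorted(set(corpuses)))  # -> ['books', 'code'], i.e. the corpus_list built above
```

Per-file weights are then accumulated per corpus, and `BlendableDataset` is fed the achieved per-corpus weights (`train_weights`, `valid_weights`, `test_weights`) instead of the raw per-file weights.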
@@ -688,23 +1073,27 @@ def _build_sample_idx(sizes, doc_idx, seq_length, return sample_idx +@dlp.log def _build_shuffle_idx(num_samples, total_size, np_rng): """Build the range [0, size) and shuffle.""" - print(' > building shuffle index with split [0, {}) and [{}, {}) ' - '...'.format(num_samples, num_samples, total_size), flush=True) + log.debug( + " > building shuffle index with split [0, {}) and [{}, {}) " "...".format( + num_samples, num_samples, total_size + ) + ) dtype_ = np.uint32 if total_size >= (np.iinfo(np.uint32).max - 1): dtype_ = np.int64 - shuffle_idx_first = np.arange(start=0, stop=num_samples, - step=1, dtype=dtype_) + shuffle_idx_first = np.arange(start=0, stop=num_samples, step=1, dtype=dtype_) np_rng.shuffle(shuffle_idx_first) if num_samples == total_size: return shuffle_idx_first - shuffle_idx_last = np.arange(start=num_samples, stop=total_size, - step=1, dtype=dtype_) + shuffle_idx_last = np.arange( + start=num_samples, stop=total_size, step=1, dtype=dtype_ + ) np_rng.shuffle(shuffle_idx_last) return np.concatenate((shuffle_idx_first, shuffle_idx_last)) diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index 5c3a054875..9dee0589b6 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -15,12 +15,28 @@ namespace py = pybind11; using namespace std; const int32_t LONG_SENTENCE_LEN = 512; +void build_concat_indices(py::array_t& dataset_index, py::array_t& dataset_sample_index, + const py::array_t &num_samples, + const int64_t num_datasets, const bool verbose) { + if (verbose) { + std::cout << "> building indices for corpus datasets ..." << std::endl; + } + auto dataset_index_ptr = dataset_index.mutable_unchecked<1>(); + auto num_samples_ptr = num_samples.unchecked<1>(); + auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>(); + int64_t m = 0; + for(uint64_t i=0; i& dataset_index, +void build_blending_indices(py::array_t& dataset_index, py::array_t& dataset_sample_index, const py::array_t& weights, - const int32_t num_datasets, + const int64_t num_datasets, const int64_t size, const bool verbose) { /* Given multiple datasets and a weighting array, build samples such that it follows those wieghts.*/ @@ -58,7 +74,7 @@ void build_blending_indices(py::array_t& dataset_index, } // Populate the indices. - dataset_index_ptr[sample_idx] = static_cast(max_error_index); + dataset_index_ptr[sample_idx] = static_cast(max_error_index); dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index]; // Update the total samples. @@ -84,7 +100,7 @@ py::array build_sample_idx(const py::array_t& sizes_, const py::array_t& doc_idx_, const int32_t seq_length, const int32_t num_epochs, - const int64_t tokens_per_epoch) { + const int64_t tokens_per_epoch, const bool verbose=false) { /* Sample index (sample_idx) is used for gpt2 like dataset for which the documents are flattened and the samples are built based on this 1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2] @@ -103,16 +119,17 @@ py::array build_sample_idx(const py::array_t& sizes_, // Mapping and it's length (1D). 
int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length; int64_t* sample_idx = new int64_t[2*(num_samples+1)]; - - cout << " using:" << endl << std::flush; - cout << " number of documents: " << - doc_idx_.shape(0) / num_epochs << endl << std::flush; - cout << " number of epochs: " << num_epochs << - endl << std::flush; - cout << " sequence length: " << seq_length << - endl << std::flush; - cout << " total number of samples: " << num_samples << - endl << std::flush; + if (verbose) { + cout << " using:" << endl << std::flush; + cout << " number of documents: " << + doc_idx_.shape(0) / num_epochs << endl << std::flush; + cout << " number of epochs: " << num_epochs << + endl << std::flush; + cout << " sequence length: " << seq_length << + endl << std::flush; + cout << " total number of samples: " << num_samples << + endl << std::flush; + } // Index into sample_idx. int64_t sample_index = 0; @@ -698,4 +715,5 @@ PYBIND11_MODULE(helpers, m) { m.def("build_blocks_mapping", &build_blocks_mapping); m.def("build_sample_idx", &build_sample_idx); m.def("build_blending_indices", &build_blending_indices); + m.def("build_concat_indices", &build_concat_indices); } diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index efd3310af0..e2a0c4751f 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -15,13 +15,20 @@ from functools import lru_cache import os + import shutil import struct from itertools import accumulate import numpy as np import torch -from megatron import print_rank_0 + +# from megatron import print_rank_0 +from megatron.utils import Profile, get_logger + +log = get_logger(__name__) + +dlp = Profile("DATASET") def __best_fitting_dtype(vocab_size=None): @@ -32,33 +39,31 @@ def __best_fitting_dtype(vocab_size=None): def get_available_dataset_impl(): - return ['lazy', 'cached', 'mmap'] + return ["lazy", "cached", "mmap"] def infer_dataset_impl(path): if IndexedDataset.exists(path): - with open(index_file_path(path), 'rb') as f: + with open(index_file_path(path), "rb") as f: magic = f.read(8) if magic == IndexedDataset._HDR_MAGIC: - return 'cached' + return "cached" elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]: - return 'mmap' + return "mmap" else: return None else: print(f"Dataset does not exist: {path}") print( - "Path should be a basename that both .idx and .bin can be " - "appended to get full filenames." + "Path should be a basename that both .idx and .bin can be appended to get full filenames." ) return None def make_builder(out_file, impl, vocab_size=None): - if impl == 'mmap': + if impl == "mmap": return MMapIndexedDatasetBuilder( - out_file, - dtype=__best_fitting_dtype(vocab_size) + out_file, dtype=__best_fitting_dtype(vocab_size) ) else: return IndexedDatasetBuilder(out_file) @@ -66,28 +71,25 @@ def make_builder(out_file, impl, vocab_size=None): def make_dataset(path, impl, skip_warmup=False): if not IndexedDataset.exists(path): + print(f"Dataset does not exist: {path}") print( - f"Dataset does not exist: {path}" - ) - print( - "Path should be a basename that both .idx and .bin " - "can be appended to get full filenames." + "Path should be a basename that both .idx and .bin can be appended to get full filenames." 
) return None - if impl == 'infer': + if impl == "infer": impl = infer_dataset_impl(path) - if impl == 'lazy' and IndexedDataset.exists(path): + if impl == "lazy" and IndexedDataset.exists(path): return IndexedDataset(path) - elif impl == 'cached' and IndexedDataset.exists(path): + elif impl == "cached" and IndexedDataset.exists(path): return IndexedCachedDataset(path) - elif impl == 'mmap' and MMapIndexedDataset.exists(path): + elif impl == "mmap" and MMapIndexedDataset.exists(path): return MMapIndexedDataset(path, skip_warmup) print(f"Unknown dataset implementation: {impl}") return None def dataset_exists(path, impl): - if impl == 'mmap': + if impl == "mmap": return MMapIndexedDataset.exists(path) else: return IndexedDataset.exists(path) @@ -123,11 +125,11 @@ def code(dtype): def index_file_path(prefix_path): - return prefix_path + '.idx' + return prefix_path + ".idx" def data_file_path(prefix_path): - return prefix_path + '.bin' + return prefix_path + ".bin" def create_doc_idx(sizes): @@ -140,7 +142,8 @@ def create_doc_idx(sizes): class IndexedDataset(torch.utils.data.Dataset): """Loader for IndexedDataset""" - _HDR_MAGIC = b'TNTIDX\x00\x00' + + _HDR_MAGIC = b"TNTIDX\x00\x00" def __init__(self, path): super().__init__() @@ -148,45 +151,46 @@ def __init__(self, path): self.data_file = None self.read_index(path) + @dlp.log def read_index(self, path): - with open(index_file_path(path), 'rb') as f: + with open(index_file_path(path), "rb") as f: magic = f.read(8) assert magic == self._HDR_MAGIC, ( - 'Index file doesn\'t match expected format. ' - 'Make sure that --dataset-impl is configured properly.' + "Index file doesn't match expected format. " + "Make sure that --dataset-impl is configured properly." ) version = f.read(8) - assert struct.unpack('= self._len: - raise IndexError('index out of range') + raise IndexError("index out of range") def __del__(self): if self.data_file: self.data_file.close() # @lru_cache(maxsize=8) - def __getitem__(self, idx) -> np.ndarray: + @dlp.log + def __getitem__(self, idx): if not self.data_file: self.read_data(self.path) if isinstance(idx, int): i = idx self.check_index(i) - tensor_size = ( - self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] - ) + tensor_size = self.sizes[self.dim_offsets[i] : self.dim_offsets[i + 1]] a = np.empty(tensor_size, dtype=self.dtype) self.data_file.seek(self.data_offsets[i] * self.element_size) self.data_file.readinto(a) @@ -194,16 +198,15 @@ def __getitem__(self, idx) -> np.ndarray: elif isinstance(idx, slice): start, stop, step = idx.indices(len(self)) if step != 1: - raise ValueError( - "Slices into indexed_dataset must be contiguous" - ) - sizes = self.sizes[self.dim_offsets[start]:self.dim_offsets[stop]] + raise ValueError("Slices into indexed_dataset must be contiguous") + sizes = self.sizes[self.dim_offsets[start] : self.dim_offsets[stop]] size = sum(sizes) a = np.empty(size, dtype=self.dtype) self.data_file.seek(self.data_offsets[start] * self.element_size) self.data_file.readinto(a) offsets = list(accumulate(sizes)) - return np.split(a, offsets[:-1]) + sents = np.split(a, offsets[:-1]) + return sents def __len__(self): return self._len @@ -216,9 +219,8 @@ def size(self, index): @staticmethod def exists(path): - return ( - os.path.exists(index_file_path(path)) - and os.path.exists(data_file_path(path)) + return os.path.exists(index_file_path(path)) and os.path.exists( + data_file_path(path) ) @property @@ -227,7 +229,6 @@ def supports_prefetch(self): class IndexedCachedDataset(IndexedDataset): - def __init__(self, 
path): super().__init__(path) self.cache = None @@ -237,6 +238,7 @@ def __init__(self, path): def supports_prefetch(self): return True + @dlp.log def prefetch(self, indices): if all(i in self.cache_index for i in indices): return @@ -252,7 +254,7 @@ def prefetch(self, indices): for i in indices: self.cache_index[i] = ptx size = self.data_offsets[i + 1] - self.data_offsets[i] - a = self.cache[ptx: ptx + size] + a = self.cache[ptx : ptx + size] self.data_file.seek(self.data_offsets[i] * self.element_size) self.data_file.readinto(a) ptx += size @@ -262,16 +264,15 @@ def prefetch(self, indices): self.data_file = None # @lru_cache(maxsize=8) + @dlp.log def __getitem__(self, idx): if isinstance(idx, int): i = idx self.check_index(i) - tensor_size = ( - self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] - ) + tensor_size = self.sizes[self.dim_offsets[i] : self.dim_offsets[i + 1]] a = np.empty(tensor_size, dtype=self.dtype) ptx = self.cache_index[i] - np.copyto(a, self.cache[ptx: ptx + a.size]) + np.copyto(a, self.cache[ptx : ptx + a.size]) return a elif isinstance(idx, slice): # Hack just to make this work, can optimizer later if necessary @@ -292,8 +293,9 @@ class IndexedDatasetBuilder(object): np.float64: 8, } + @dlp.log def __init__(self, out_file, dtype=np.int32): - self.out_file = open(out_file, 'wb') + self.out_file = open(out_file, "wb") self.dtype = dtype self.data_offsets = [0] self.dim_offsets = [0] @@ -301,11 +303,10 @@ def __init__(self, out_file, dtype=np.int32): self.element_size = self.element_sizes[self.dtype] self.doc_idx = [0] + @dlp.log def add_item(self, tensor): bytes = self.out_file.write(np.array(tensor.numpy(), dtype=self.dtype)) - self.data_offsets.append( - self.data_offsets[-1] + bytes / self.element_size - ) + self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size) for s in tensor.size(): self.sizes.append(s) self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size())) @@ -313,6 +314,7 @@ def add_item(self, tensor): def end_document(self): self.doc_idx.append(len(self.sizes)) + @dlp.log def merge_file_(self, another_file): index = IndexedDataset(another_file) assert index.dtype == self.dtype @@ -330,7 +332,7 @@ def merge_file_(self, another_file): self.doc_idx.extend((doc_offset + index.doc_idx)[1:]) - with open(data_file_path(another_file), 'rb') as f: + with open(data_file_path(another_file), "rb") as f: while True: data = f.read(1024) if data: @@ -340,14 +342,12 @@ def merge_file_(self, another_file): def finalize(self, index_file): self.out_file.close() - index = open(index_file, 'wb') - index.write(b'TNTIDX\x00\x00') - index.write(struct.pack(' setting tensorboard ...') _GLOBAL_TENSORBOARD_WRITER = SummaryWriter( log_dir=args.tensorboard_dir, @@ -225,9 +224,9 @@ def _set_timers(args): def _ensure_var_is_initialized(var, name): """Make sure the input variable is not None.""" - assert var is not None, '{} is not initialized.'.format(name) + assert var is not None, f'{name} is not initialized.' def _ensure_var_is_not_initialized(var, name): """Make sure the input variable is not None.""" - assert var is None, '{} is already initialized.'.format(name) + assert var is None, f'{name} is already initialized.' 
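For reference, the corpus concatenation performed by the new `helpers.build_concat_indices` binding added to `megatron/data/helpers.cpp` earlier in this diff (and consumed by `BuildCorpusDataset._build_indices_concat` in `gpt_dataset.py`) amounts to laying the member datasets of one corpus end to end. A minimal NumPy sketch of that index layout follows; the function name here is hypothetical and the pybind11 implementation is what actually runs.

```python
import numpy as np


def build_concat_indices_numpy(num_samples_per_dataset):
    """Rough NumPy restatement (illustration only) of helpers.build_concat_indices:
    global sample m maps to (dataset i, local sample j) in concatenation order."""
    total = int(np.sum(num_samples_per_dataset))
    dataset_index = np.zeros(total, dtype=np.int64)
    dataset_sample_index = np.zeros(total, dtype=np.int64)
    m = 0
    for i, n in enumerate(num_samples_per_dataset):
        for j in range(int(n)):
            dataset_index[m] = i           # which member dataset of the corpus
            dataset_sample_index[m] = j    # sample index within that dataset
            m += 1
    return dataset_index, dataset_sample_index


# A corpus built from three files contributing 2, 1 and 3 samples:
di, dsi = build_concat_indices_numpy([2, 1, 3])
# di  -> [0 0 1 2 2 2]
# dsi -> [0 1 0 0 1 2]
```

Concatenation is the default; weighted blending inside a corpus is used only when `args.blend_sample_in_corpus` is set, and the per-corpus shuffle is applied only when `args.shuffle_sample_in_corpus` is set, as shown in the `BuildCorpusDataset` constructor above.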
diff --git a/megatron/initialize.py b/megatron/initialize.py index 538f7fc456..90acf496ee 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -185,6 +185,7 @@ def setup_deepspeed_random_and_activation_checkpointing(args): deepspeed.checkpointing.configure( mpu, + deepspeed_config=args.deepspeed_config, partition_activations=args.partition_activations, contiguous_checkpointing=args.contigious_checkpointing, num_checkpoints=num_layers, diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index 64158ef99a..f2beea06e8 100644 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -1,23 +1,36 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# type:ignore +# noqa: E401,E402,F401 import torch from deepspeed.accelerator.real_accelerator import get_accelerator -if get_accelerator().device_name() == 'xpu': - import intel_extension_for_pytorch -if get_accelerator().device_name() == 'cuda': +accelerator = get_accelerator() + +if accelerator is not None and accelerator.device_name() == "xpu": + import intel_extension_for_pytorch # noqa: F401 # type: ignore + +if accelerator is not None and accelerator.device_name() == "cuda": from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm - from apex.normalization import MixedFusedRMSNorm as RMSNorm + + try: + from apex.normalization import MixedFusedRMSNorm as RMSNorm # type:ignore + + HAS_APEX = True + except Exception: + HAS_APEX = False + from .rmsnorm import RMSNorm else: if hasattr(torch.xpu, "IpexRmsNorm"): from .fused_rmsnorm import RMSNorm else: - from .rmsnorm import RMSNorm - from torch.nn import LayerNorm - -from .distributed import DistributedDataParallel -from .bert_model import BertModel -from .gpt_model import GPTModel, GPTModelPipe -from .t5_model import T5Model -from .language_model import get_language_model -from .module import Float16Module + from .rmsnorm import RMSNorm # noqa:E401,E402,F401 + from torch.nn import LayerNorm # noqa:E401,E402,F401 + + +from .distributed import DistributedDataParallel # noqa:E401,E402,F401 +from .bert_model import BertModel # noqa:E401,E402,F401 +from .gpt_model import GPTModel, GPTModelPipe # noqa:E401,E402,F401 +from .t5_model import T5Model # noqa:E401,E402,F401 +from .language_model import get_language_model # noqa:E401,E402,F401 +from .module import Float16Module # noqa:E401,E402,F401 diff --git a/megatron/model/fused_rmsnorm.py b/megatron/model/fused_rmsnorm.py index ab1c42e24e..d69b0822a4 100644 --- a/megatron/model/fused_rmsnorm.py +++ b/megatron/model/fused_rmsnorm.py @@ -5,16 +5,20 @@ from torch.nn import init import intel_extension_for_pytorch as ipex # noqa + # Taken from facebookresearch/llama class RMSNorm(torch.nn.Module): - def __init__(self, dim: int, eps: float = 1e-6, sequence_parallel=False): + def __init__( + self, dim: int, eps: float = 1e-6, sequence_parallel: bool = False + ): super().__init__() self.eps = eps - self.weight = Parameter(torch.ones(dim, - dtype=get_args().params_dtype)) + self.weight = Parameter(torch.ones(dim, dtype=get_args().params_dtype)) self.sequence_parallel = sequence_parallel - setattr(self.weight, 'sequence_parallel', self.sequence_parallel) + setattr(self.weight, "sequence_parallel", self.sequence_parallel) def forward(self, x): - output = torch.xpu.IpexRmsNorm(x, self.weight.shape, self.weight, self.eps) + output = torch.xpu.IpexRmsNorm( + x, self.weight.shape, self.weight, self.eps + ) return output diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index 
3b8e4e0da1..eebf8744ca 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -390,10 +390,16 @@ def __init__(self, post_process=True, num_experts=[1]): args = get_args() - # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5. - if args.untie_embeddings_and_output_weights: assert not add_decoder - super(TransformerLanguageModel, self).__init__(share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights) - + # TODO: passing `share_embeddings_and_output_weights=False` + # will not work correctly for T5 and embeddings will not be synced. + # Fix later for T5. + if args.untie_embeddings_and_output_weights: + assert not add_decoder + super(TransformerLanguageModel, self).__init__( + share_embeddings_and_output_weights=( + not args.untie_embeddings_and_output_weights + ) + ) self.pre_process = pre_process self.post_process = post_process self.hidden_size = config.hidden_size @@ -406,27 +412,35 @@ def __init__(self, self.add_pooler = add_pooler self.encoder_hidden_state = None self.add_retriever = args.retro_add_retriever - self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights + self.untie_embeddings_and_output_weights = ( + args.untie_embeddings_and_output_weights + ) self.num_experts = num_experts # Embeddings. if self.pre_process: - self.embedding = Embedding(self.hidden_size, - args.padded_vocab_size, - args.max_position_embeddings, - args.hidden_dropout, - config, - self.num_tokentypes, - args.embedding_weights_in_fp32) + self.embedding = Embedding( + self.hidden_size, + args.padded_vocab_size, + args.max_position_embeddings, + args.hidden_dropout, + config, + self.num_tokentypes, + args.embedding_weights_in_fp32 + ) self._embedding_key = 'embedding' # Rotary positional embeddings - self.use_rotary_position_embeddings = \ - args.use_rotary_position_embeddings + self.use_rotary_position_embeddings = ( + args.use_rotary_position_embeddings + ) if args.use_rotary_position_embeddings: self.seq_length = args.seq_length - rotary_dim = args.hidden_size // args.num_attention_heads \ - if args.kv_channels is None else args.kv_channels + rotary_dim = ( + args.hidden_size // args.num_attention_heads + if args.kv_channels is None + else args.kv_channels + ) if args.rotary_percent < 1.0: rotary_dim = int(rotary_dim * args.rotary_percent) @@ -434,15 +448,22 @@ def __init__(self, # partial rotary embeddings, which is better than full rotary # Wang and Komatsuzaki et al # https://github.com/kingoflolz/mesh-transformer-jax/ - self.rotary_pos_emb = RotaryEmbedding(rotary_dim, theta=args.rope_theta) + self.rotary_pos_emb = RotaryEmbedding( + rotary_dim, + theta=args.rope_theta + ) # Encoder (usually set to True, False if part of an encoder-decoder # architecture and in encoder-only stage). 
if self.add_encoder: self.encoder = ParallelTransformer( config, - model_type=args.model_type if not args.retro_add_retriever \ - else ModelType.retro_decoder, + # args.model_type if not args.retro_add_retriever + # else ModelType.retro_decoder + model_type=( + ModelType.retro_decoder if args.retro_add_retriever + else args.model_type + ), self_attn_mask_type=self.encoder_attn_mask_type, pre_process=self.pre_process, post_process=self.post_process, @@ -462,7 +483,8 @@ def __init__(self, self_attn_mask_type=self.decoder_attn_mask_type, pre_process=self.pre_process, post_process=self.post_process, - num_experts=self.num_experts) + num_experts=self.num_experts + ) self._decoder_key = 'decoder' else: self.decoder = None @@ -479,24 +501,30 @@ def __init__(self, args.padded_vocab_size, config=config, init_method=self.init_method, - bias=False) # Setting bias to False always to keep it consistent with embedding tying that also does not have a bias. + # Setting bias to False always to keep it consistent with + # embedding tying that also does not have a bias. + bias=False + ) self._output_layer_key = 'output_layer' def set_input_tensor(self, input_tensor): """ See megatron.model.transformer.set_input_tensor()""" - # This is usually handled in schedules.py but some inference code still # gives us non-lists or None if not isinstance(input_tensor, list): input_tensor = [input_tensor] if self.add_encoder and self.add_decoder: - assert len(input_tensor) == 1, \ - 'input_tensor should only be length 1 for stage with both encoder and decoder' + assert len(input_tensor) == 1, ( + 'input_tensor should only be length 1 ' + 'for stage with both encoder and decoder' + ) self.encoder.set_input_tensor(input_tensor[0]) elif self.add_encoder: - assert len(input_tensor) == 1, \ - 'input_tensor should only be length 1 for stage with only encoder' + assert len(input_tensor) == 1, ( + 'input_tensor should only be length 1 ' + 'for stage with only encoder' + ) self.encoder.set_input_tensor(input_tensor[0]) elif self.add_decoder: if len(input_tensor) == 2: @@ -506,32 +534,50 @@ def set_input_tensor(self, input_tensor): self.decoder.set_input_tensor(None) self.encoder_hidden_state = input_tensor[0] else: - raise Exception('input_tensor must have either length 1 or 2') + raise Exception( + 'input_tensor must have either length 1 or 2' + ) else: - raise Exception('Stage must have at least either encoder or decoder') - - def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, - dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None, - retriever_input_ids=None, - retriever_position_ids=None, - retriever_attn_mask=None, - enc_dec_attn_mask=None, tokentype_ids=None, - inference_params=None, - pooling_sequence_index=0, - enc_hidden_states=None, output_enc_hidden=False): + raise Exception( + 'Stage must have at least either encoder or decoder' + ) + + def forward( + self, + enc_input_ids, + enc_position_ids, + enc_attn_mask, + dec_input_ids=None, + dec_position_ids=None, + dec_attn_mask=None, + retriever_input_ids=None, + retriever_position_ids=None, + retriever_attn_mask=None, + enc_dec_attn_mask=None, + tokentype_ids=None, + inference_params=None, + pooling_sequence_index=0, + enc_hidden_states=None, + output_enc_hidden=False + ): args = get_args() # Encoder embedding. 
if self.pre_process: - encoder_input = self.embedding(enc_input_ids, enc_position_ids, - tokentype_ids=tokentype_ids) + encoder_input = self.embedding( + enc_input_ids, + enc_position_ids, + tokentype_ids=tokentype_ids + ) else: encoder_input = None # Retriever embedding. if self.add_retriever and self.pre_process: - retriever_input = self.embedding(retriever_input_ids, - retriever_position_ids, - tokentype_ids=tokentype_ids) + retriever_input = self.embedding( + retriever_input_ids, + retriever_position_ids, + tokentype_ids=tokentype_ids + ) else: retriever_input = None diff --git a/megatron/model/rmsnorm.py b/megatron/model/rmsnorm.py index 7bcaec37ef..68b792c83d 100644 --- a/megatron/model/rmsnorm.py +++ b/megatron/model/rmsnorm.py @@ -7,20 +7,23 @@ from torch.nn import init from torch.nn.parameter import Parameter + # Taken from facebookresearch/llama class RMSNorm(torch.nn.Module): - def __init__(self, dim: int, eps: float = 1e-6, sequence_parallel=False): + def __init__( + self, dim: int, eps: float = 1e-6, sequence_parallel: bool = False + ): super().__init__() self.eps = eps init_device = None - if get_accelerator().device_name() == 'hpu': - init_device = get_accelerator().current_device_name() - self.weight = Parameter(torch.empty(dim, - device=init_device, - dtype=get_args().params_dtype)) + if get_accelerator().device_name() == "hpu": + init_device = get_accelerator().current_device_name() + self.weight = Parameter( + torch.empty(dim, device=init_device, dtype=get_args().params_dtype) + ) init.ones_(self.weight) self.sequence_parallel = sequence_parallel - setattr(self.weight, 'sequence_parallel', self.sequence_parallel) + setattr(self.weight, "sequence_parallel", self.sequence_parallel) def _norm(self, x): return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 74e977103f..592ff2855b 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -25,6 +25,7 @@ try: from deepspeed.sequence.layer import DistributedAttention + dist_attn_supported = True except ImportError: dist_attn_supported = False @@ -70,28 +71,31 @@ hyperparameters: transformer hyperparameters """ + class DropPath(MegatronModule): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """ - def __init__(self, drop_prob=0.): + def __init__(self, drop_prob=0.0): super(DropPath, self).__init__() self.drop_prob = drop_prob def forward(self, hidden_state): - if self.drop_prob == 0. or not self.training: + if self.drop_prob == 0.0 or not self.training: return hidden_state keep_prob = 1 - self.drop_prob # work with diff dim tensors, not just 2D ConvNets # hidden_state: [s, b, h] shape = (1,) + (hidden_state.shape[1],) + (1,) * (hidden_state.ndim - 2) - random_tensor = keep_prob + \ - torch.rand(shape, dtype=hidden_state.dtype, device=hidden_state.device) + random_tensor = keep_prob + torch.rand( + shape, dtype=hidden_state.dtype, device=hidden_state.device + ) random_tensor.floor_() # binarize output = hidden_state.div(keep_prob) * random_tensor return output + class ParallelMLP(MegatronModule): """MLP. 
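The `rmsnorm.py` hunk above is a pure formatting change; the math is untouched: each vector is scaled by the reciprocal root-mean-square of its last dimension and then by a learned gain. A self-contained sketch of the same normalization (the `forward` shown here follows the usual llama-style pattern and is an assumption, since the diff only touches `__init__` and `_norm`):

```python
import torch

class TinyRMSNorm(torch.nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = torch.nn.Parameter(torch.ones(dim))   # init.ones_, as above

    def _norm(self, x):
        # x / sqrt(mean(x^2) + eps), computed over the last dimension
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        return self.weight * self._norm(x)

x = torch.randn(2, 4, 8)
print(TinyRMSNorm(8)(x).shape)   # torch.Size([2, 4, 8])
```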
@@ -120,7 +124,7 @@ def __init__(self, config, moe=False, enable_expert_tensor_parallelism=False): gather_output=False, skip_bias_add=True, moe=moe, - enable_expert_tensor_parallelism=enable_expert_tensor_parallelism + enable_expert_tensor_parallelism=enable_expert_tensor_parallelism, ) self.bias_gelu_fusion = False @@ -132,13 +136,17 @@ def __init__(self, config, moe=False, enable_expert_tensor_parallelism=False): elif args.onnx_safe: self.activation_func = erf_gelu elif args.swiglu: + def swiglu(x): x = torch.chunk(x, 2, dim=-1) return F.silu(x[0]) * x[1] + self.activation_func = swiglu elif args.squared_relu: + def squared_relu(x): return torch.pow(F.relu(x), 2) + self.activation_func = squared_relu else: self.bias_gelu_fusion = args.bias_gelu_fusion @@ -153,7 +161,7 @@ def squared_relu(x): bias=self.add_bias, input_is_parallel=True, moe=moe, - enable_expert_tensor_parallelism=enable_expert_tensor_parallelism + enable_expert_tensor_parallelism=enable_expert_tensor_parallelism, ) def forward(self, hidden_states): @@ -175,10 +183,12 @@ def forward(self, hidden_states): output, output_bias = self.dense_4h_to_h(intermediate_parallel) return output, output_bias + class SwitchMLP(MegatronModule): """ Routes input to one of N MLP "experts" """ + def __init__(self, config): super(SwitchMLP, self).__init__() args = get_args() @@ -195,29 +205,29 @@ def forward(self, hidden_states): route = self.router(hidden_states) route = torch.nn.functional.softmax(route, dim=2) max_prob, max_ind = torch.max(route, dim=2) - max_prob = torch.unsqueeze(max_prob, 2) # [s b 1] + max_prob = torch.unsqueeze(max_prob, 2) # [s b 1] # TODO (rprenger) TODO this could be made easier to read # Converting [s, b, h] to [s*b, h]. # Each vector could be routed differently - hidden_states = hidden_states.view(-1, hidden_states.size(2)) # [s*b h] - max_prob = max_prob.view(-1, max_prob.size(2)) # [s*b 1] - max_ind = max_ind.view(-1) # [s*b] + hidden_states = hidden_states.view(-1, hidden_states.size(2)) # [s*b h] + max_prob = max_prob.view(-1, max_prob.size(2)) # [s*b 1] + max_ind = max_ind.view(-1) # [s*b] output_total = torch.empty_like(hidden_states) output_bias_total = torch.empty_like(hidden_states) - #TODO (rprenger) This does each expert in serial, but it could be parallelized + # TODO (rprenger) This does each expert in serial, but it could be parallelized for expert_num, expert in enumerate(self.experts): local_indices = (max_ind == expert_num).nonzero() - hidden = hidden_states[local_indices,:] + hidden = hidden_states[local_indices, :] output, output_bias = expert(hidden) output_bias = output_bias.expand_as(output) - output_total[local_indices,:] = output - output_bias_total[local_indices,:] = output_bias + output_total[local_indices, :] = output + output_bias_total[local_indices, :] = output_bias - output_total = output_total*max_prob - output_bias_total = output_bias_total*max_prob + output_total = output_total * max_prob + output_bias_total = output_bias_total * max_prob output_total = output_total.view(s, b, h) output_bias_total = output_bias_total.view(s, b, h) @@ -226,8 +236,7 @@ def forward(self, hidden_states): class CoreAttention(MegatronModule): - def __init__(self, layer_number, config, - attn_mask_type=AttnMaskType.padding): + def __init__(self, layer_number, config, attn_mask_type=AttnMaskType.padding): super(CoreAttention, self).__init__() self.fp16 = config.fp16 self.bf16 = config.bf16 @@ -246,14 +255,19 @@ def __init__(self, layer_number, config, seq_parallel_world_size = 1 if 
parallel_state.sequence_parallel_is_initialized(): seq_parallel_world_size = parallel_state.get_sequence_parallel_world_size() - world_size = seq_parallel_world_size if seq_parallel_world_size > 1 else parallel_state.get_tensor_model_parallel_world_size() + world_size = ( + seq_parallel_world_size + if seq_parallel_world_size > 1 + else parallel_state.get_tensor_model_parallel_world_size() + ) - self.hidden_size_per_partition = core.utils.divide(projection_size, - world_size) + self.hidden_size_per_partition = core.utils.divide(projection_size, world_size) self.hidden_size_per_attention_head = core.utils.divide( - projection_size, config.num_attention_heads) + projection_size, config.num_attention_heads + ) self.num_attention_heads_per_partition = core.utils.divide( - config.num_attention_heads, world_size) + config.num_attention_heads, world_size + ) coeff = None self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) @@ -262,49 +276,56 @@ def __init__(self, layer_number, config, self.norm_factor *= coeff self.scale_mask_softmax = FusedScaleMaskSoftmax( - self.fp16, self.bf16, + self.fp16, + self.bf16, self.attn_mask_type, config.masked_softmax_fusion, attention_mask_func, self.attention_softmax_in_fp32, - coeff) + coeff, + ) # Dropout. Note that for a single iteration, this layer will generate # different outputs on different number of parallel partitions but # on average it should not be partition dependent. self.attention_dropout = torch.nn.Dropout(config.attention_dropout) - def forward(self, query_layer, key_layer, - value_layer, attention_mask): + def forward(self, query_layer, key_layer, value_layer, attention_mask): # =================================== # Raw attention scores. [b, np, s, s] # =================================== # [b, np, sq, sk] - output_size = (query_layer.size(1), - query_layer.size(2), - query_layer.size(0), - key_layer.size(0)) + output_size = ( + query_layer.size(1), + query_layer.size(2), + query_layer.size(0), + key_layer.size(0), + ) # [sq, b, np, hn] -> [sq, b * np, hn] - query_layer = query_layer.view(output_size[2], - output_size[0] * output_size[1], -1) + query_layer = query_layer.view( + output_size[2], output_size[0] * output_size[1], -1 + ) # [sk, b, np, hn] -> [sk, b * np, hn] - key_layer = key_layer.view(output_size[3], - output_size[0] * output_size[1], -1) + key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) # preallocting input tensor: [b * np, sq, sk] matmul_input_buffer = parallel_state.get_global_memory_buffer().get_tensor( - (output_size[0]*output_size[1], output_size[2], output_size[3]), - query_layer.dtype, "mpu") + (output_size[0] * output_size[1], output_size[2], output_size[3]), + query_layer.dtype, + "mpu", + ) # Raw attention scores. 
[b * np, sq, sk] matmul_result = torch.baddbmm( matmul_input_buffer, - query_layer.transpose(0, 1), # [b * np, sq, hn] + query_layer.transpose(0, 1), # [b * np, sq, hn] key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] - beta=0.0, alpha=(1.0/self.norm_factor)) + beta=0.0, + alpha=(1.0 / self.norm_factor), + ) # change view to [b, np, sq, sk] attention_scores = matmul_result.view(*output_size) @@ -314,8 +335,7 @@ def forward(self, query_layer, key_layer, # =========================== # attention scores and attention mask [b, np, sq, sk] - attention_probs = self.scale_mask_softmax(attention_scores, - attention_mask) + attention_probs = self.scale_mask_softmax(attention_scores, attention_mask) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. @@ -333,18 +353,22 @@ def forward(self, query_layer, key_layer, # [sk, b, np, hn] --> [b, np, sq, hn] # context layer shape: [b, np, sq, hn] - output_size = (value_layer.size(1), - value_layer.size(2), - query_layer.size(0), - value_layer.size(3)) + output_size = ( + value_layer.size(1), + value_layer.size(2), + query_layer.size(0), + value_layer.size(3), + ) # change view [sk, b * np, hn] - value_layer = value_layer.view(value_layer.size(0), - output_size[0] * output_size[1], -1) + value_layer = value_layer.view( + value_layer.size(0), output_size[0] * output_size[1], -1 + ) # change view [b * np, sq, sk] - attention_probs = attention_probs.view(output_size[0] * output_size[1], - output_size[2], -1) + attention_probs = attention_probs.view( + output_size[0] * output_size[1], output_size[2], -1 + ) # matmul: [b * np, sq, hn] context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) @@ -356,8 +380,9 @@ def forward(self, query_layer, key_layer, context_layer = context_layer.permute(2, 0, 1, 3).contiguous() # [sq, b, np, hn] --> [sq, b, hp] - new_context_layer_shape = context_layer.size()[:-2] + \ - (self.hidden_size_per_partition,) + new_context_layer_shape = context_layer.size()[:-2] + ( + self.hidden_size_per_partition, + ) context_layer = context_layer.view(*new_context_layer_shape) return context_layer @@ -373,12 +398,24 @@ class FlashSelfAttention(torch.nn.Module): attention_dropout: The dropout rate to apply to the attention (default: 0.0) """ - def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0, - device=None, dtype=None): + + def __init__( + self, + causal=False, + softmax_scale=None, + attention_dropout=0.0, + device=None, + dtype=None, + ): super().__init__() - assert flash_attn_unpadded_func is not None or flash_attn_varlen_func is not None or flash_attn_builder is not None, \ - ('Please install FlashAttention first, e.g., with pip install flash-attn or implement your own flash attention') - assert rearrange is not None, 'Please install einops first, e.g., with pip install einops' + assert ( + flash_attn_unpadded_func is not None + or flash_attn_varlen_func is not None + or flash_attn_builder is not None + ), "Please install FlashAttention first, e.g., with pip install flash-attn or implement your own flash attention" + assert ( + rearrange is not None + ), "Please install einops first, e.g., with pip install einops" self.causal = causal self.softmax_scale = softmax_scale self.dropout_p = attention_dropout @@ -389,14 +426,18 @@ def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0, self.use_flash_attn_builder_v2 = False self.use_flash_attn = False if args.use_flash_attn_builder: - if 
hasattr(flash_attn_builder, 'flash_attn_func'): + if hasattr(flash_attn_builder, "flash_attn_func"): self.flash_attn_func = flash_attn_builder.flash_attn_func self.use_flash_attn_builder_v1 = True else: self.flash_attn_func = flash_attn_builder.flash_attn_func_v2 self.use_flash_attn_builder_v2 = True else: - self.flash_attn_func = flash_attn_varlen_func if args.use_flash_attn_v2 else flash_attn_unpadded_func + self.flash_attn_func = ( + flash_attn_varlen_func + if args.use_flash_attn_v2 + else flash_attn_unpadded_func + ) self.use_flash_attn = True def forward(self, q, k, v): @@ -406,42 +447,67 @@ def forward(self, q, k, v): q, k, v: The tensor containing the query, key, and value. (B, S, H, D) """ - assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q,k,v))) + assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q, k, v))) assert all((get_accelerator().on_accelerator(i) for i in (q, k, v))) batch_size, seqlen_q = q.shape[0], q.shape[1] seqlen_k = k.shape[1] if self.use_flash_attn: - q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]] - cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, - device=q.device) + q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]] + cu_seqlens_q = torch.arange( + 0, + (batch_size + 1) * seqlen_q, + step=seqlen_q, + dtype=torch.int32, + device=q.device, + ) elif self.use_flash_attn_builder_v1: - q, k, v = [rearrange(x, 'b s h d -> b h s d').contiguous() for x in [q, k, v]] + q, k, v = [ + rearrange(x, "b s h d -> b h s d").contiguous() for x in [q, k, v] + ] else: # use_flash_attn_builder_v2 - q, k, v = [rearrange(x, 'b s h d -> b h s d') for x in [q, k, v]] + q, k, v = [rearrange(x, "b s h d -> b h s d") for x in [q, k, v]] if self.training: # during training q,k,v always have same seqlen assert seqlen_k == seqlen_q is_causal = self.causal - cu_seqlens_k = cu_seqlens_q if get_accelerator().device_name() == 'cuda' else None + cu_seqlens_k = ( + cu_seqlens_q if get_accelerator().device_name() == "cuda" else None + ) dropout_p = self.dropout_p else: # turn off FA causal mask after first inference autoregressive iteration # only on first autoregressive step q,k,v have same seqlen is_causal = seqlen_q == seqlen_k - cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, - device=q.device) if get_accelerator().device_name() == 'cuda' else None + cu_seqlens_k = ( + torch.arange( + 0, + (batch_size + 1) * seqlen_k, + step=seqlen_k, + dtype=torch.int32, + device=q.device, + ) + if get_accelerator().device_name() == "cuda" + else None + ) dropout_p = 0 if self.use_flash_attn: output = self.flash_attn_func( - q, k, v, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen_k, + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + seqlen_q, + seqlen_k, dropout_p, - softmax_scale=self.softmax_scale, causal=is_causal + softmax_scale=self.softmax_scale, + causal=is_causal, ) else: # use_flash_attn_builder @@ -450,15 +516,16 @@ def forward(self, q, k, v): ) if self.use_flash_attn: - output = rearrange(output, '(b s) ... -> b s ...', b=batch_size) + output = rearrange(output, "(b s) ... 
-> b s ...", b=batch_size) elif self.use_flash_attn_builder_v1: - output = rearrange(output, 'b h s d -> b s h d').contiguous() + output = rearrange(output, "b h s d -> b s h d").contiguous() else: # use_flash_attn_builder_v2: - output = rearrange(output, 'b h s d -> b s h d') + output = rearrange(output, "b h s d -> b s h d") return output + class FlashSelfAttentionTriton(torch.nn.Module): """Implement the scaled dot product attention with softmax. Arguments @@ -469,11 +536,22 @@ class FlashSelfAttentionTriton(torch.nn.Module): attention_dropout: The dropout rate to apply to the attention (default: 0.0) """ - def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0, - device=None, dtype=None): + + def __init__( + self, + causal=False, + softmax_scale=None, + attention_dropout=0.0, + device=None, + dtype=None, + ): super().__init__() - assert flash_attn_func is not None, ('Triton version of FlashAttention is not installed.') - assert rearrange is not None, 'Please install einops first, e.g., with pip install einops' + assert ( + flash_attn_func is not None + ), "Triton version of FlashAttention is not installed." + assert ( + rearrange is not None + ), "Please install einops first, e.g., with pip install einops" self.causal = causal self.softmax_scale = softmax_scale self.dropout_p = attention_dropout @@ -487,13 +565,13 @@ def forward(self, q, k, v): assert q.dtype in [torch.float16, torch.bfloat16] assert q.is_cuda - q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous() - for x in (q, k, v)] - + q, k, v = [rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)] + output = flash_attn_func(q, k, v, None, self.causal) - output = rearrange(output, 'b s h d -> s b (h d)').contiguous() + output = rearrange(output, "b s h d -> s b (h d)").contiguous() return output + class ParallelAttention(MegatronModule): """Parallel self-attention layer abstract class. @@ -501,9 +579,13 @@ class ParallelAttention(MegatronModule): and returns output of the same size. 
""" - def __init__(self, config, layer_number, - attention_type=AttnType.self_attn, - attn_mask_type=AttnMaskType.padding): + def __init__( + self, + config, + layer_number, + attention_type=AttnType.self_attn, + attn_mask_type=AttnMaskType.padding, + ): super(ParallelAttention, self).__init__() args = get_args() self.layer_number = max(1, layer_number) @@ -513,12 +595,18 @@ def __init__(self, config, layer_number, self.sequence_parallel = config.sequence_parallel self.num_attention_heads = config.num_attention_heads self.num_key_value_heads = config.num_key_value_heads - self.use_gqa = (self.num_attention_heads != self.num_key_value_heads) - - self.use_flash_attn = (args.use_flash_attn_v1 or args.use_flash_attn_triton or args.use_flash_attn_v2 or \ - args.use_flash_attn_builder) \ - and attention_type == AttnType.self_attn \ + self.use_gqa = self.num_attention_heads != self.num_key_value_heads + + self.use_flash_attn = ( + ( + args.use_flash_attn_v1 + or args.use_flash_attn_triton + or args.use_flash_attn_v2 + or args.use_flash_attn_builder + ) + and attention_type == AttnType.self_attn and self.attn_mask_type == AttnMaskType.causal + ) self.use_flash_attn_triton = args.use_flash_attn_triton if self.use_flash_attn: global flash_attn_builder @@ -528,38 +616,53 @@ def __init__(self, config, layer_number, flash_attn_builder = None if args.use_flash_attn_v1: - assert flash_attn_unpadded_func != None, "Cannot import FlashAttention v1 " + assert ( + flash_attn_unpadded_func != None + ), "Cannot import FlashAttention v1 " if args.use_flash_attn_v2: - assert flash_attn_varlen_func != None, "Cannot import FlashAttention v2 " + assert ( + flash_attn_varlen_func != None + ), "Cannot import FlashAttention v2 " if args.use_flash_attn_triton: assert flash_attn_func != None, "Cannot import FlashAttention triton " if args.use_flash_attn_builder: - assert flash_attn_builder != None, "Cannot find FlashAttention op builder " + assert ( + flash_attn_builder != None + ), "Cannot find FlashAttention op builder " - assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports ' - 'self-attention for now') - assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only ' - 'supports causal mask for now') + assert attention_type == AttnType.self_attn, ( + "FlashAttention code path only supports " "self-attention for now" + ) + assert self.attn_mask_type == AttnMaskType.causal, ( + "FlashAttention code path only " "supports causal mask for now" + ) if rearrange is None: - raise ImportError('einops is not installed, please install with pip install einops') + raise ImportError( + "einops is not installed, please install with pip install einops" + ) projection_size = config.kv_channels * config.num_attention_heads # Per attention head and per partition values. 
world_size = parallel_state.get_tensor_model_parallel_world_size() self.hidden_size_per_attention_head = core.utils.divide( - projection_size, config.num_attention_heads) + projection_size, config.num_attention_heads + ) self.num_attention_heads_per_partition = core.utils.divide( - config.num_attention_heads, world_size) + config.num_attention_heads, world_size + ) # Per GQA head and per partition values self.num_key_value_heads_per_partition = core.utils.divide( - config.num_key_value_heads, world_size) + config.num_key_value_heads, world_size + ) self.num_key_value_groups = core.utils.divide( - config.num_attention_heads, config.num_key_value_heads) + config.num_attention_heads, config.num_key_value_heads + ) kv_projection_size = config.kv_channels * config.num_key_value_heads assert self.hidden_size_per_attention_head == core.utils.divide( - kv_projection_size, config.num_key_value_heads) + kv_projection_size, config.num_key_value_heads + ) # Strided linear layer. if attention_type == AttnType.self_attn: @@ -569,7 +672,8 @@ def __init__(self, config, layer_number, config=config, init_method=config.init_method, bias=args.add_bias_linear, - gather_output=False) + gather_output=False, + ) else: assert attention_type == AttnType.cross_attn self.query = tensor_parallel.ColumnParallelLinear( @@ -578,8 +682,8 @@ def __init__(self, config, layer_number, config=config, init_method=config.init_method, bias=config.add_bias_linear, - gather_output=False) - + gather_output=False, + ) self.key_value = tensor_parallel.ColumnParallelLinear( config.hidden_size, @@ -587,32 +691,48 @@ def __init__(self, config, layer_number, config=config, init_method=config.init_method, bias=config.add_bias_linear, - gather_output=False) + gather_output=False, + ) # Currently FlashAttention only works with causal mask if self.use_flash_attn_triton: - local_attn = FlashSelfAttentionTriton(causal=True, attention_dropout=args.attention_dropout) + local_attn = FlashSelfAttentionTriton( + causal=True, attention_dropout=args.attention_dropout + ) elif self.use_flash_attn: - local_attn = FlashSelfAttention(causal=True, attention_dropout=config.attention_dropout) + local_attn = FlashSelfAttention( + causal=True, attention_dropout=config.attention_dropout + ) else: local_attn = CoreAttention(self.layer_number, config, self.attn_mask_type) - self.enable_ds_sequence_parallel = parallel_state.get_sequence_parallel_world_size() > 1 \ - or args.force_ds_sequence_parallel + self.enable_ds_sequence_parallel = ( + parallel_state.get_sequence_parallel_world_size() > 1 + or args.force_ds_sequence_parallel + ) if self.enable_ds_sequence_parallel: - assert dist_attn_supported, 'Distributed attention is not supported in this DeepSpeed version' - assert args.num_attention_heads % parallel_state.get_sequence_parallel_world_size() == 0 + assert ( + dist_attn_supported + ), "Distributed attention is not supported in this DeepSpeed version" + assert ( + args.num_attention_heads + % parallel_state.get_sequence_parallel_world_size() + == 0 + ) self.dist_attn = DistributedAttention( - local_attn, - parallel_state.get_sequence_parallel_group(), - gather_idx=1 if args.use_flash_attn_v1 or args.use_flash_attn_v2 else 0) + local_attn, + parallel_state.get_sequence_parallel_group(), + gather_idx=1 if args.use_flash_attn_v1 or args.use_flash_attn_v2 else 0, + ) # flash_attn_cuda assumes [b, s, nh, hd] layout, we need to make sure all2all gathers into the correct sequence dimension. 
else: if self.use_flash_attn: self.core_attention_flash = local_attn else: self.core_attention = local_attn - self.checkpoint_core_attention = config.recompute_granularity == 'selective' + self.checkpoint_core_attention = ( + config.recompute_granularity == "selective" + ) # Output. self.dense = tensor_parallel.RowParallelLinear( @@ -622,29 +742,38 @@ def __init__(self, config, layer_number, init_method=config.output_layer_init_method, bias=args.add_bias_linear, input_is_parallel=True, - skip_bias_add=True) - + skip_bias_add=True, + ) - def _checkpointed_attention_forward(self, query_layer, key_layer, - value_layer, attention_mask, - rotary_pos_emb=None): + def _checkpointed_attention_forward( + self, query_layer, key_layer, value_layer, attention_mask, rotary_pos_emb=None + ): """Forward method with activation checkpointing.""" + def custom_forward(*inputs): query_layer = inputs[0] key_layer = inputs[1] value_layer = inputs[2] attention_mask = inputs[3] - output_ = self.core_attention(query_layer, key_layer, - value_layer, attention_mask) + output_ = self.core_attention( + query_layer, key_layer, value_layer, attention_mask + ) return output_ - q_pos_emb, k_pos_emb = (None, None) if rotary_pos_emb is None \ - else rotary_pos_emb + q_pos_emb, k_pos_emb = ( + (None, None) if rotary_pos_emb is None else rotary_pos_emb + ) hidden_states = tensor_parallel.checkpoint( custom_forward, - False, query_layer, key_layer, value_layer, attention_mask, - q_pos_emb, k_pos_emb) + False, + query_layer, + key_layer, + value_layer, + attention_mask, + q_pos_emb, + k_pos_emb, + ) return hidden_states @@ -655,7 +784,8 @@ def _allocate_memory(self, inference_max_sequence_len, batch_size): self.num_attention_heads_per_partition, self.hidden_size_per_attention_head, dtype=self.params_dtype, - device=get_accelerator().current_device_name()) + device=get_accelerator().current_device_name(), + ) def repeat_kv(self, hidden_states, n_rep): slen, batch, num_key_value_heads_per_partition, head_dim = hidden_states.shape @@ -667,22 +797,36 @@ def repeat_kv(self, hidden_states, n_rep): return hidden_states.expand(slen, batch, n_rep, head_dim) else: hidden_states = hidden_states[:, :, :, None, :].expand( - slen, batch, num_key_value_heads_per_partition, n_rep, head_dim) - return hidden_states.reshape(slen, batch, - num_key_value_heads_per_partition * n_rep, - head_dim) - + slen, batch, num_key_value_heads_per_partition, n_rep, head_dim + ) + return hidden_states.reshape( + slen, batch, num_key_value_heads_per_partition * n_rep, head_dim + ) + def split_tensor(self, mixed_x_layer): - query_layer, key_layer, value_layer = torch.split(mixed_x_layer, [self.num_key_value_groups, 1, 1], dim=-2) - query_layer = query_layer.reshape(mixed_x_layer.shape[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)) + query_layer, key_layer, value_layer = torch.split( + mixed_x_layer, [self.num_key_value_groups, 1, 1], dim=-2 + ) + query_layer = query_layer.reshape( + mixed_x_layer.shape[:2] + + ( + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) + ) key_layer = torch.squeeze(key_layer, -2) value_layer = torch.squeeze(value_layer, -2) return query_layer, key_layer, value_layer - def forward(self, hidden_states, attention_mask, - encoder_output=None, inference_params=None, - rotary_pos_emb=None): + def forward( + self, + hidden_states, + attention_mask, + encoder_output=None, + inference_params=None, + rotary_pos_emb=None, + ): # hidden_states: [sq, b, h] # 
================================================= @@ -694,15 +838,20 @@ def forward(self, hidden_states, attention_mask, inf_max_seq_len = inference_params.max_sequence_len inf_max_batch_size = inference_params.max_batch_size inference_key_memory = self._allocate_memory( - inf_max_seq_len, inf_max_batch_size) + inf_max_seq_len, inf_max_batch_size + ) inference_value_memory = self._allocate_memory( - inf_max_seq_len, inf_max_batch_size) + inf_max_seq_len, inf_max_batch_size + ) inference_params.key_value_memory_dict[self.layer_number] = ( - inference_key_memory, inference_value_memory) + inference_key_memory, + inference_value_memory, + ) is_first_step = True else: - inference_key_memory, inference_value_memory = \ + inference_key_memory, inference_value_memory = ( inference_params.key_value_memory_dict[self.layer_number] + ) # ===================== # Query, Key, and Value @@ -713,43 +862,45 @@ def forward(self, hidden_states, attention_mask, mixed_x_layer, _ = self.query_key_value(hidden_states) # [sq, b, ((nq + 2 * nkv) * hn)] --> [sq, b, nkv, (nq // nkv + 2), hn] - new_tensor_shape = mixed_x_layer.size()[:-1] + \ - (-1, (self.num_key_value_groups + 2), - self.hidden_size_per_attention_head) + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + -1, + (self.num_key_value_groups + 2), + self.hidden_size_per_attention_head, + ) mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) # [sq, b, nkv, (nq // nkv + 2), hn] --> 3 [sq, b, np, hn] - (query_layer, - key_layer, - value_layer) = self.split_tensor(mixed_x_layer) + (query_layer, key_layer, value_layer) = self.split_tensor(mixed_x_layer) # Repeat kv if self.use_gqa: key_layer = self.repeat_kv(key_layer, self.num_key_value_groups) - value_layer = self.repeat_kv(value_layer, - self.num_key_value_groups) + value_layer = self.repeat_kv(value_layer, self.num_key_value_groups) else: - assert not self.use_gqa, 'GQA + cross-attn not tested yet' + assert not self.use_gqa, "GQA + cross-attn not tested yet" # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] mixed_kv_layer, _ = self.key_value(encoder_output) # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn] - new_tensor_shape = mixed_kv_layer.size()[:-1] + \ - (self.num_attention_heads_per_partition, - 2 * self.hidden_size_per_attention_head) + new_tensor_shape = mixed_kv_layer.size()[:-1] + ( + self.num_attention_heads_per_partition, + 2 * self.hidden_size_per_attention_head, + ) mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape) # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn] - (key_layer, - value_layer) = tensor_parallel.split_tensor_along_last_dim(mixed_kv_layer, 2) + (key_layer, value_layer) = tensor_parallel.split_tensor_along_last_dim( + mixed_kv_layer, 2 + ) # Attention head [sq, b, h] --> [sq, b, hp] query_layer, _ = self.query(hidden_states) # [sq, b, hp] --> [sq, b, np, hn] - new_tensor_shape = query_layer.size()[:-1] + \ - (self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head) + new_tensor_shape = query_layer.size()[:-1] + ( + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) query_layer = query_layer.view(*new_tensor_shape) # ================================== @@ -761,7 +912,7 @@ def forward(self, hidden_states, attention_mask, if isinstance(rotary_pos_emb, tuple): rotary_pos_emb = rotary_pos_emb else: - rotary_pos_emb = ((rotary_pos_emb,) * 2) + rotary_pos_emb = (rotary_pos_emb,) * 2 if inference_params: batch_start = inference_params.batch_size_offset @@ -771,15 +922,16 @@ def forward(self, hidden_states, attention_mask, 
sequence_end = sequence_start + key_layer.size(0) assert sequence_end <= inference_key_memory.size(0) # Copy key and values. - inference_key_memory[sequence_start:sequence_end, - batch_start:batch_end, ...] = key_layer - inference_value_memory[sequence_start:sequence_end, - batch_start:batch_end, ...] = value_layer - key_layer = inference_key_memory[ - :sequence_end, batch_start:batch_end, ...] + inference_key_memory[ + sequence_start:sequence_end, batch_start:batch_end, ... + ] = key_layer + inference_value_memory[ + sequence_start:sequence_end, batch_start:batch_end, ... + ] = value_layer + key_layer = inference_key_memory[:sequence_end, batch_start:batch_end, ...] value_layer = inference_value_memory[ - :sequence_end, batch_start:batch_end, ...] - + :sequence_end, batch_start:batch_end, ... + ] # adjust the key rotary positional embedding if rotary_pos_emb is not None: @@ -801,7 +953,6 @@ def forward(self, hidden_states, attention_mask, k_pos_emb = k_pos_emb[:sequence_end, :, :, :] rotary_pos_emb = (q_pos_emb, k_pos_emb) - # ================================== # core attention computation # ================================== @@ -820,37 +971,55 @@ def forward(self, hidden_states, attention_mask, batch_dim_idx = 1 if self.use_flash_attn: if not self.use_flash_attn_triton: - query_layer, key_layer, value_layer = [rearrange(x, 's b ... -> b s ...').contiguous() - for x in (query_layer, key_layer, value_layer)] + query_layer, key_layer, value_layer = [ + rearrange(x, "s b ... -> b s ...").contiguous() + for x in (query_layer, key_layer, value_layer) + ] batch_dim_idx = 0 - context_layer = self.dist_attn(query_layer, key_layer, value_layer, batch_dim_idx) + context_layer = self.dist_attn( + query_layer, key_layer, value_layer, batch_dim_idx + ) if not self.use_flash_attn_triton: - context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous() + context_layer = rearrange( + context_layer, "b s h d -> s b (h d)" + ).contiguous() else: - context_layer = self.dist_attn(query_layer, key_layer, value_layer, attention_mask) + context_layer = self.dist_attn( + query_layer, key_layer, value_layer, attention_mask + ) else: if self.use_flash_attn: if not self.use_flash_attn_triton: - query_layer, key_layer, value_layer = [rearrange(x, 's b ... -> b s ...').contiguous() - for x in (query_layer, key_layer, value_layer)] + query_layer, key_layer, value_layer = [ + rearrange(x, "s b ... -> b s ...").contiguous() + for x in (query_layer, key_layer, value_layer) + ] if self.sequence_parallel: - context_layer = self.core_attention_flash(query_layer, key_layer, value_layer) + context_layer = self.core_attention_flash( + query_layer, key_layer, value_layer + ) else: with tensor_parallel.get_cuda_rng_tracker().fork(): - context_layer = self.core_attention_flash(query_layer, key_layer, value_layer) + context_layer = self.core_attention_flash( + query_layer, key_layer, value_layer + ) if not self.use_flash_attn_triton: - context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous() + context_layer = rearrange( + context_layer, "b s h d -> s b (h d)" + ).contiguous() else: if self.checkpoint_core_attention: context_layer = self._checkpointed_attention_forward( - query_layer, key_layer, value_layer, attention_mask) + query_layer, key_layer, value_layer, attention_mask + ) else: context_layer = self.core_attention( - query_layer, key_layer, value_layer, attention_mask) + query_layer, key_layer, value_layer, attention_mask + ) # ================= # Output. 
[sq, b, h] @@ -873,22 +1042,21 @@ def bias_dropout_add(x, bias, residual, prob, training): def get_bias_dropout_add(training): def _bias_dropout_add(x, bias, residual, prob): return bias_dropout_add(x, bias, residual, prob, training) + return _bias_dropout_add @torch.jit.script -def bias_dropout_add_fused_train(x: torch.Tensor, - bias: Optional[torch.Tensor], - residual: torch.Tensor, - prob: float) -> torch.Tensor: +def bias_dropout_add_fused_train( + x: torch.Tensor, bias: Optional[torch.Tensor], residual: torch.Tensor, prob: float +) -> torch.Tensor: return bias_dropout_add(x, bias, residual, prob, True) @torch.jit.script -def bias_dropout_add_fused_inference(x: torch.Tensor, - bias: Optional[torch.Tensor], - residual: torch.Tensor, - prob: float) -> torch.Tensor: +def bias_dropout_add_fused_inference( + x: torch.Tensor, bias: Optional[torch.Tensor], residual: torch.Tensor, prob: float +) -> torch.Tensor: return bias_dropout_add(x, bias, residual, prob, False) @@ -899,10 +1067,15 @@ class ParallelTransformerLayer(MegatronModule): output of the same size. """ - def __init__(self, config, - layer_number, layer_type=LayerType.encoder, - self_attn_mask_type=AttnMaskType.padding, - drop_path_rate=0., num_experts=1): + def __init__( + self, + config, + layer_number, + layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.padding, + drop_path_rate=0.0, + num_experts=1, + ): # retriever=None): args = get_args() @@ -910,110 +1083,130 @@ def __init__(self, config, self.layer_number = layer_number self.layer_type = layer_type - self.apply_residual_connection_post_layernorm \ - = config.apply_residual_connection_post_layernorm + self.apply_residual_connection_post_layernorm = ( + config.apply_residual_connection_post_layernorm + ) self.bf16 = config.bf16 self.fp32_residual_connection = config.fp32_residual_connection # Layernorm on the input data. - if args.normalization == 'layernorm': - if get_accelerator().device_name() == 'cuda': + if args.normalization == "layernorm": + if get_accelerator().device_name() == "cuda": self.input_layernorm = LayerNorm( config.hidden_size, eps=config.layernorm_epsilon, no_persist_layer_norm=args.no_persist_layer_norm, sequence_parallel=config.sequence_parallel, apply_layernorm_1p=args.apply_layernorm_1p, - mem_efficient_ln=args.mem_efficient_ln) + mem_efficient_ln=args.mem_efficient_ln, + ) else: self.input_layernorm = LayerNorm( - config.hidden_size, - eps=config.layernorm_epsilon) + config.hidden_size, eps=config.layernorm_epsilon + ) else: - self.input_layernorm = RMSNorm(config.hidden_size, config.layernorm_epsilon, - sequence_parallel=config.sequence_parallel) + self.input_layernorm = RMSNorm( + config.hidden_size, + config.layernorm_epsilon, + sequence_parallel=config.sequence_parallel, + ) + # self.input_layernorm = RMSNorm(config.hidden_size, config.layernorm_epsilon_ # Self attention. 
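`bias_dropout_add` and its `@torch.jit.script` train/inference wrappers above only pick up formatting changes here; the body is not part of this hunk. The pattern they fuse is add-bias, dropout, add-residual; a sketch of that pattern, under the assumption that it matches the standard Megatron implementation:

```python
import torch
from typing import Optional

def bias_dropout_add(x: torch.Tensor, bias: Optional[torch.Tensor],
                     residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor:
    # Sketch of the fused pattern: (x + bias) -> dropout -> + residual.
    if bias is not None:
        x = x + bias
    out = torch.nn.functional.dropout(x, p=prob, training=training)
    return residual + out

x = torch.randn(4, 2, 8)
residual = torch.randn(4, 2, 8)
out = bias_dropout_add(x, None, residual, prob=0.1, training=True)
print(out.shape)   # torch.Size([4, 2, 8])
```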
self.self_attention = ParallelAttention( config, layer_number, attention_type=AttnType.self_attn, - attn_mask_type=self_attn_mask_type) + attn_mask_type=self_attn_mask_type, + ) self.hidden_dropout = config.hidden_dropout self.bias_dropout_fusion = config.bias_dropout_fusion self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else None # Layernorm on the attention output - if args.normalization == 'layernorm': - if get_accelerator().device_name() == 'cuda': + if args.normalization == "layernorm": + if get_accelerator().device_name() == "cuda": self.post_attention_layernorm = LayerNorm( config.hidden_size, eps=config.layernorm_epsilon, no_persist_layer_norm=not config.persist_layer_norm, sequence_parallel=config.sequence_parallel, apply_layernorm_1p=args.apply_layernorm_1p, - mem_efficient_ln=args.mem_efficient_ln) + mem_efficient_ln=args.mem_efficient_ln, + ) else: self.post_attention_layernorm = LayerNorm( - config.hidden_size, - eps=config.layernorm_epsilon) + config.hidden_size, eps=config.layernorm_epsilon + ) else: - self.post_attention_layernorm = RMSNorm(config.hidden_size, config.layernorm_epsilon, - sequence_parallel=config.sequence_parallel) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, + config.layernorm_epsilon, + sequence_parallel=config.sequence_parallel, + ) # Cross attention. - if self.layer_type in (LayerType.decoder, - LayerType.retro_decoder, - LayerType.retro_decoder_with_retriever, - LayerType.retro_encoder): + if self.layer_type in ( + LayerType.decoder, + LayerType.retro_decoder, + LayerType.retro_decoder_with_retriever, + LayerType.retro_encoder, + ): self.inter_attention = ParallelAttention( - config, - layer_number, - attention_type=AttnType.cross_attn) + config, layer_number, attention_type=AttnType.cross_attn + ) # Layernorm on the attention output. 
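The `DropPath(drop_path_rate)` wired in above implements per-sample stochastic depth, as defined near the top of this file's diff: during training each batch element keeps the residual branch with probability `1 - drop_prob`, and kept outputs are rescaled by `1 / keep_prob`. A compact standalone illustration of the same masking logic:

```python
import torch

def drop_path(hidden_state: torch.Tensor, drop_prob: float, training: bool) -> torch.Tensor:
    # hidden_state: [s, b, h]; the keep/drop mask is drawn per batch element.
    if drop_prob == 0.0 or not training:
        return hidden_state
    keep_prob = 1 - drop_prob
    shape = (1,) + (hidden_state.shape[1],) + (1,) * (hidden_state.ndim - 2)
    random_tensor = keep_prob + torch.rand(shape, dtype=hidden_state.dtype,
                                           device=hidden_state.device)
    random_tensor.floor_()                        # binarize: 0 or 1 per sample
    return hidden_state.div(keep_prob) * random_tensor

x = torch.ones(16, 4, 8)                          # [s, b, h]
y = drop_path(x, drop_prob=0.5, training=True)
# Each of the 4 batch columns is either all zeros or all 2.0 (= 1 / keep_prob).
print(sorted(y.unique().tolist()))
```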
- if args.normalization == 'layernorm': + if args.normalization == "layernorm": self.post_inter_attention_layernorm = LayerNorm( config.hidden_size, eps=config.layernorm_epsilon, no_persist_layer_norm=not config.persist_layer_norm, sequence_parallel=config.sequence_parallel, apply_layernorm_1p=args.apply_layernorm_1p, - mem_efficient_ln=args.mem_efficient_ln) + mem_efficient_ln=args.mem_efficient_ln, + ) else: - self.post_inter_attention_layernorm = RMSNorm(config.hidden_size, - config.layernorm_epsilon, - sequence_parallel=config.sequence_parallel) + self.post_inter_attention_layernorm = RMSNorm( + config.hidden_size, + config.layernorm_epsilon, + sequence_parallel=config.sequence_parallel, + ) # MLP self.num_experts = num_experts if args.num_experts_switch is not None: - self.mlp = SwitchMLP(config) # Megatron-LM's MoE + self.mlp = SwitchMLP(config) # Megatron-LM's MoE else: - if self.num_experts <= 1: # dense, not MoE + if self.num_experts <= 1: # dense, not MoE self.mlp = ParallelMLP(config) - else: # DeepSpeed's MoE + else: # DeepSpeed's MoE enable_expert_tensor_parallelism = args.enable_expert_tensor_parallelism - self.mlp = MoE(args.hidden_size, - ParallelMLP(config, - moe=True, - enable_expert_tensor_parallelism=enable_expert_tensor_parallelism), - num_experts=self.num_experts, - ep_size=args.moe_expert_parallel_size, - k=args.topk, - use_residual=(args.mlp_type == 'residual'), - capacity_factor=args.moe_train_capacity_factor, - eval_capacity_factor=args.moe_eval_capacity_factor, - min_capacity=args.moe_min_capacity, - drop_tokens=args.moe_token_dropping, - use_tutel=args.use_tutel, - enable_expert_tensor_parallelism=enable_expert_tensor_parallelism, - top2_2nd_expert_sampling=args.moe_top2_2nd_expert_sampling) + self.mlp = MoE( + args.hidden_size, + ParallelMLP( + config, + moe=True, + enable_expert_tensor_parallelism=enable_expert_tensor_parallelism, + ), + num_experts=self.num_experts, + ep_size=args.moe_expert_parallel_size, + k=args.topk, + use_residual=(args.mlp_type == "residual"), + capacity_factor=args.moe_train_capacity_factor, + eval_capacity_factor=args.moe_eval_capacity_factor, + min_capacity=args.moe_min_capacity, + drop_tokens=args.moe_token_dropping, + use_tutel=args.use_tutel, + enable_expert_tensor_parallelism=enable_expert_tensor_parallelism, + top2_2nd_expert_sampling=args.moe_top2_2nd_expert_sampling, + ) # Set bias+dropout+add fusion grad_enable execution handler. 
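The MoE branch above delegates to DeepSpeed's `MoE` layer (top-k routing, capacity factors, optional residual experts), whereas `SwitchMLP` earlier in this file does plain top-1 routing: softmax over a linear router, send each token to its argmax expert, and scale that expert's output by the winning probability. A stripped-down sketch of the top-1 scheme, using a serial loop over experts as `SwitchMLP` does (modules here are hypothetical stand-ins):

```python
import torch

def top1_route(hidden: torch.Tensor, router: torch.nn.Linear,
               experts: torch.nn.ModuleList) -> torch.Tensor:
    # hidden: [tokens, h]; each token is handled by exactly one expert.
    probs = torch.softmax(router(hidden), dim=-1)           # [tokens, n_experts]
    max_prob, max_ind = probs.max(dim=-1)                    # winning prob / expert id
    out = torch.empty_like(hidden)
    for expert_id, expert in enumerate(experts):             # serial, as in SwitchMLP
        idx = (max_ind == expert_id).nonzero(as_tuple=True)[0]
        if idx.numel():
            out[idx] = expert(hidden[idx])
    return out * max_prob.unsqueeze(-1)                      # scale by router confidence

h, n_experts = 8, 4
experts = torch.nn.ModuleList(torch.nn.Linear(h, h) for _ in range(n_experts))
router = torch.nn.Linear(h, n_experts)
tokens = torch.randn(32, h)
print(top1_route(tokens, router, experts).shape)             # torch.Size([32, 8])
```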
- TORCH_MAJOR = int(torch.__version__.split('.')[0]) - TORCH_MINOR = int(torch.__version__.split('.')[1]) + TORCH_MAJOR = int(torch.__version__.split(".")[0]) + TORCH_MINOR = int(torch.__version__.split(".")[1]) use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) - self.bias_dropout_add_exec_handler = \ - nullcontext if use_nvfuser else torch.enable_grad + self.bias_dropout_add_exec_handler = ( + nullcontext if use_nvfuser else torch.enable_grad + ) if args.retro_add_retriever: retro_args = get_retro_args() @@ -1031,23 +1224,24 @@ def __init__(self, config, pre_process=True, post_process=False, ) - self._retriever_key = 'retriever' + self._retriever_key = "retriever" else: self.retriever = None - def default_decoder_cross_attention(self, - encoder_output, - enc_dec_attn_mask, - layernorm_input, - layernorm_output, - bias_dropout_add_func): - '''Cross attention for a standard encoder-decoder model.''' + def default_decoder_cross_attention( + self, + encoder_output, + enc_dec_attn_mask, + layernorm_input, + layernorm_output, + bias_dropout_add_func, + ): + """Cross attention for a standard encoder-decoder model.""" # Attention. - attention_output, attention_bias = \ - self.inter_attention(layernorm_output, - enc_dec_attn_mask, - encoder_output=encoder_output) + attention_output, attention_bias = self.inter_attention( + layernorm_output, enc_dec_attn_mask, encoder_output=encoder_output + ) # Residual connection. if self.apply_residual_connection_post_layernorm: @@ -1061,21 +1255,17 @@ def default_decoder_cross_attention(self, # Bias-dropout-add. with self.bias_dropout_add_exec_handler(): layernorm_input = bias_dropout_add_func( - attention_output, - attention_bias, - residual, - self.hidden_dropout) + attention_output, attention_bias, residual, self.hidden_dropout + ) # Layer norm. layernorm_output = self.post_inter_attention_layernorm(layernorm_input) return layernorm_input, layernorm_output - def retro_encoder_cross_attention(self, - retriever_output, - layernorm_input, - layernorm_output, - bias_dropout_add_func): + def retro_encoder_cross_attention( + self, retriever_output, layernorm_input, layernorm_output, bias_dropout_add_func + ): """Cross attention for Retro encoder. Notation: @@ -1087,16 +1277,15 @@ def retro_encoder_cross_attention(self, r : Number of retrieved tokens (neighbors + continuation). """ - ns, bs, d = layernorm_output.shape # [r, bs * l * k, d] + ns, bs, d = layernorm_output.shape # [r, bs * l * k, d] # Divide sequence dimension into chunks. - chunked_outputs = layernorm_output.reshape(self.retro_retrieved_length, - -1, - self.retro_num_neighbors, - d) - chunked_outputs_before_layer_norm = \ - layernorm_input.reshape(self.retro_retrieved_length, -1, - self.retro_num_neighbors, d) # [r, bs*l, k, d] + chunked_outputs = layernorm_output.reshape( + self.retro_retrieved_length, -1, self.retro_num_neighbors, d + ) + chunked_outputs_before_layer_norm = layernorm_input.reshape( + self.retro_retrieved_length, -1, self.retro_num_neighbors, d + ) # [r, bs*l, k, d] # Per-chunk attention. layernorm_inputs = [] @@ -1104,51 +1293,55 @@ def retro_encoder_cross_attention(self, for k in range(self.retro_num_neighbors): # Attention. 
- chunked_output = chunked_outputs[:,:,k].contiguous() - attention_output, attention_bias = \ - self.inter_attention( - chunked_output, # Q (neighbor embedding) - None, - encoder_output=retriever_output) # K, V (hidden act) + chunked_output = chunked_outputs[:, :, k].contiguous() + attention_output, attention_bias = self.inter_attention( + chunked_output, # Q (neighbor embedding) + None, + encoder_output=retriever_output, + ) # K, V (hidden act) # Residual connection. if self.apply_residual_connection_post_layernorm: residual = chunked_output else: - residual = chunked_outputs_before_layer_norm[:,:,k] + residual = chunked_outputs_before_layer_norm[:, :, k] # Re-enable torch grad to enable fused optimization. with torch.enable_grad(): layernorm_input = bias_dropout_add_func( attention_output, - None if attention_bias is None else attention_bias.expand_as(residual), + ( + None + if attention_bias is None + else attention_bias.expand_as(residual) + ), residual, - self.hidden_dropout) + self.hidden_dropout, + ) layernorm_inputs.append(layernorm_input) # Layer norm. - layernorm_output = \ - self.post_inter_attention_layernorm(layernorm_input) + layernorm_output = self.post_inter_attention_layernorm(layernorm_input) layernorm_outputs.append(layernorm_output) # Concatenate layer norms. # layernorm_input : [r, k * bs * l, d] # layernorm_output : [r, k * bs * l, d] - layernorm_input = \ - torch.stack(layernorm_inputs, dim=1).reshape(ns, bs, d) - layernorm_output = \ - torch.stack(layernorm_outputs, dim=1).reshape(ns, bs, d) + layernorm_input = torch.stack(layernorm_inputs, dim=1).reshape(ns, bs, d) + layernorm_output = torch.stack(layernorm_outputs, dim=1).reshape(ns, bs, d) return layernorm_input, layernorm_output - def retro_decoder_cross_attention(self, - retriever_input, - retriever_output, - retriever_attn_mask, - layernorm_input, - layernorm_output, - inference_params, - bias_dropout_add_func): + def retro_decoder_cross_attention( + self, + retriever_input, + retriever_output, + retriever_attn_mask, + layernorm_input, + layernorm_output, + inference_params, + bias_dropout_add_func, + ): """Cross attention for Retro decoder. 
Notation: @@ -1169,22 +1362,27 @@ def retro_decoder_cross_attention(self, first_ns = ns % self.retro_chunk_length if first_ns > 0: raise Exception("test this case.") - first_chunk, rest_chunk = \ - layernorm_output[:first_ns], layernorm_output[first_ns:] + first_chunk, rest_chunk = ( + layernorm_output[:first_ns], + layernorm_output[first_ns:], + ) first_chunk = torch.nn.functional.pad( first_chunk, (0, 0, 0, 0, 0, self.retro_chunk_length - first_ns), - 'constant', - 0) - chunked_output = \ - torch.cat((first_chunk, rest_chunk), dim=0) # [l * m, bs, d] + "constant", + 0, + ) + chunked_output = torch.cat( + (first_chunk, rest_chunk), dim=0 + ) # [l * m, bs, d] else: - chunked_output = layernorm_output # [l * m, bs, d] - chunked_output = chunked_output \ - .reshape(l, self.retro_chunk_length, bs, d) \ - .permute(1, 2, 0, 3) \ - .reshape(self.retro_chunk_length, bs * l, d) \ + chunked_output = layernorm_output # [l * m, bs, d] + chunked_output = ( + chunked_output.reshape(l, self.retro_chunk_length, bs, d) + .permute(1, 2, 0, 3) + .reshape(self.retro_chunk_length, bs * l, d) .contiguous() + ) # Get Encoder Output retriever_output = self.retriever( @@ -1192,9 +1390,11 @@ def retro_decoder_cross_attention(self, attention_mask=retriever_attn_mask, retriever_output=chunked_output, retriever_attn_mask=retriever_attn_mask, - inference_params=inference_params) # [r, k * bs * l , d] + inference_params=inference_params, + ) # [r, k * bs * l , d] retriever_output = retriever_output.reshape( - self.retro_retrieved_length * self.retro_num_neighbors, bs * l, d) # [r * k, bs * l, d] + self.retro_retrieved_length * self.retro_num_neighbors, bs * l, d + ) # [r * k, bs * l, d] # Chunks. pad = (ns - 1) % self.retro_chunk_length @@ -1202,18 +1402,20 @@ def retro_decoder_cross_attention(self, padded_chunks = torch.nn.functional.pad( attending_chunks, (0, 0, 0, 0, 0, self.retro_chunk_length - 1), - 'constant', 0) - padded_chunked_output = padded_chunks \ - .reshape(l, self.retro_chunk_length, bs, d) \ - .permute(1, 2, 0, 3) + "constant", + 0, + ) + padded_chunked_output = padded_chunks.reshape( + l, self.retro_chunk_length, bs, d + ).permute(1, 2, 0, 3) padded_chunked_output = padded_chunked_output.reshape( - self.retro_chunk_length, bs * l, d).contiguous() + self.retro_chunk_length, bs * l, d + ).contiguous() # Encoder output. - attention_output, attention_bias = \ - self.inter_attention(padded_chunked_output, - None, - encoder_output=retriever_output) + attention_output, attention_bias = self.inter_attention( + padded_chunked_output, None, encoder_output=retriever_output + ) # Residual connection. 
if self.apply_residual_connection_post_layernorm: @@ -1225,17 +1427,27 @@ def retro_decoder_cross_attention(self, with torch.enable_grad(): layernorm_input = bias_dropout_add_func( attention_output, - None if attention_bias is None else attention_bias.expand_as(attention_output), + ( + None + if attention_bias is None + else attention_bias.expand_as(attention_output) + ), torch.zeros_like(attention_output), - self.hidden_dropout) - layernorm_input = layernorm_input \ - .reshape(self.retro_chunk_length, bs, l, d) \ - .permute(2, 0, 1, 3) # [l, m, bs, d] - layernorm_input = layernorm_input.reshape(self.retro_chunk_length * l, bs, d) + self.hidden_dropout, + ) + layernorm_input = layernorm_input.reshape( + self.retro_chunk_length, bs, l, d + ).permute( + 2, 0, 1, 3 + ) # [l, m, bs, d] + layernorm_input = layernorm_input.reshape( + self.retro_chunk_length * l, bs, d + ) layernorm_input = torch.nn.functional.pad( - layernorm_input, - (0, 0, 0, 0, pad, 0), - 'constant', 0)[:ns] # [ns, b, d] + layernorm_input, (0, 0, 0, 0, pad, 0), "constant", 0 + )[ + :ns + ] # [ns, b, d] layernorm_input = layernorm_input + residual # Layer norm post the decoder attention @@ -1243,26 +1455,31 @@ def retro_decoder_cross_attention(self, return retriever_output, layernorm_input, layernorm_output - def forward(self, hidden_states, attention_mask=None, - encoder_output=None, enc_dec_attn_mask=None, - retriever_input=None, - retriever_output=None, - retriever_attn_mask=None, - inference_params=None, - rotary_pos_emb=None, - aggregated_moe_loss=None): + def forward( + self, + hidden_states, + attention_mask=None, + encoder_output=None, + enc_dec_attn_mask=None, + retriever_input=None, + retriever_output=None, + retriever_attn_mask=None, + inference_params=None, + rotary_pos_emb=None, + aggregated_moe_loss=None, + ): # hidden_states: [s, b, h] # Layer norm at the beginning of the transformer layer. layernorm_output = self.input_layernorm(hidden_states) # Self attention. - attention_output, attention_bias = \ - self.self_attention( - layernorm_output, - attention_mask, - inference_params=inference_params, - rotary_pos_emb=rotary_pos_emb) + attention_output, attention_bias = self.self_attention( + layernorm_output, + attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) # Residual connection. if self.apply_residual_connection_post_layernorm: @@ -1287,14 +1504,14 @@ def forward(self, hidden_states, attention_mask=None, attention_bias = attention_bias.expand_as(residual) with self.bias_dropout_add_exec_handler(): layernorm_input = bias_dropout_add_func( - attention_output, - attention_bias, - residual, - self.hidden_dropout) + attention_output, attention_bias, residual, self.hidden_dropout + ) else: - out = torch.nn.functional.dropout(attention_output + attention_bias, - p=self.hidden_dropout, - training=self.training) + out = torch.nn.functional.dropout( + attention_output + attention_bias, + p=self.hidden_dropout, + training=self.training, + ) layernorm_input = residual + self.drop_path(out) # Layer norm post the self attention. 
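The layer `forward` above only changes call formatting; the control flow is the usual pre-LN residual pattern: normalize, attend, add a residual (either the normalized or the raw input, depending on `apply_residual_connection_post_layernorm`), normalize again, run the MLP, add the second residual. A minimal skeleton of that flow with plain modules standing in for the parallel attention/MLP used here (dropout and bias fusion omitted):

```python
import torch

class TinyPreLNBlock(torch.nn.Module):
    def __init__(self, hidden: int, post_ln_residual: bool = False):
        super().__init__()
        self.ln1 = torch.nn.LayerNorm(hidden)
        self.ln2 = torch.nn.LayerNorm(hidden)
        self.attn = torch.nn.Linear(hidden, hidden)   # stand-in for self-attention
        self.mlp = torch.nn.Linear(hidden, hidden)    # stand-in for ParallelMLP
        self.post_ln_residual = post_ln_residual      # ~ apply_residual_connection_post_layernorm

    def forward(self, hidden_states):
        ln_out = self.ln1(hidden_states)
        attn_out = self.attn(ln_out)
        residual = ln_out if self.post_ln_residual else hidden_states
        x = residual + attn_out                        # bias_dropout_add collapses to this
        ln_out = self.ln2(x)
        residual = ln_out if self.post_ln_residual else x
        return residual + self.mlp(ln_out)

x = torch.randn(10, 2, 16)                             # [s, b, h]
print(TinyPreLNBlock(16)(x).shape)                     # torch.Size([10, 2, 16])
```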
@@ -1304,23 +1521,25 @@ def forward(self, hidden_states, attention_mask=None, if self.layer_type == LayerType.encoder: pass elif self.layer_type == LayerType.decoder: - layernorm_input, layernorm_output = \ - self.default_decoder_cross_attention( - encoder_output, - enc_dec_attn_mask, - layernorm_input, - layernorm_output, - bias_dropout_add_func) + layernorm_input, layernorm_output = self.default_decoder_cross_attention( + encoder_output, + enc_dec_attn_mask, + layernorm_input, + layernorm_output, + bias_dropout_add_func, + ) elif self.layer_type == LayerType.retro_encoder: - layernorm_input, layernorm_output = \ - self.retro_encoder_cross_attention( - retriever_output, - layernorm_input, - layernorm_output, - bias_dropout_add_func) - elif self.layer_type in (LayerType.retro_decoder, - LayerType.retro_decoder_with_retriever): - retriever_output, layernorm_input, layernorm_output = \ + layernorm_input, layernorm_output = self.retro_encoder_cross_attention( + retriever_output, + layernorm_input, + layernorm_output, + bias_dropout_add_func, + ) + elif self.layer_type in ( + LayerType.retro_decoder, + LayerType.retro_decoder_with_retriever, + ): + retriever_output, layernorm_input, layernorm_output = ( self.retro_decoder_cross_attention( retriever_input, retriever_output, @@ -1328,14 +1547,19 @@ def forward(self, hidden_states, attention_mask=None, layernorm_input, layernorm_output, inference_params, - bias_dropout_add_func) + bias_dropout_add_func, + ) + ) else: - raise Exception("Unsupported layer type, '%s'." % - self.layer_type.name) + raise Exception("Unsupported layer type, '%s'." % self.layer_type.name) # MLP. - moe_loss = torch.tensor(0.0, device=layernorm_output.device, dtype=layernorm_output.dtype) - mlp_bias = torch.tensor(0.0, device=layernorm_output.device, dtype=layernorm_output.dtype) + moe_loss = torch.tensor( + 0.0, device=layernorm_output.device, dtype=layernorm_output.dtype + ) + mlp_bias = torch.tensor( + 0.0, device=layernorm_output.device, dtype=layernorm_output.dtype + ) if self.num_experts == 1: mlp_output, mlp_bias = self.mlp(layernorm_output) @@ -1357,10 +1581,8 @@ def forward(self, hidden_states, attention_mask=None, mlp_bias = mlp_bias.expand_as(residual) with self.bias_dropout_add_exec_handler(): output = bias_dropout_add_func( - mlp_output, - mlp_bias, - residual, - self.hidden_dropout) + mlp_output, mlp_bias, residual, self.hidden_dropout + ) # Jit compiled function creates 'view' tensor. This tensor # potentially gets saved in the MPU checkpoint function context, @@ -1368,16 +1590,16 @@ def forward(self, hidden_states, attention_mask=None, # won't result in memory savings (like the data loader, or # p2p_communication), it serves to document the origin of this # 'view' tensor. 
- output = core.utils.make_viewless_tensor(inp = output, - requires_grad = output.requires_grad, - keep_graph = True) + output = core.utils.make_viewless_tensor( + inp=output, requires_grad=output.requires_grad, keep_graph=True + ) else: if mlp_bias is not None: mlp_output = mlp_output + mlp_bias - out = torch.nn.functional.dropout(mlp_output, - p=self.hidden_dropout, - training=self.training) + out = torch.nn.functional.dropout( + mlp_output, p=self.hidden_dropout, training=self.training + ) output = residual + self.drop_path(out) if self.layer_type == LayerType.retro_decoder_with_retriever: @@ -1406,25 +1628,47 @@ class ParallelTransformerLayerPipe(ParallelTransformerLayer): If no mask is provided, the module will query `self._args.attn_mask` for the mask and only return `super().forward(...)` """ - def __init__(self, config, - layer_number, layer_type=LayerType.encoder, - self_attn_mask_type=AttnMaskType.padding, - drop_path_rate=0., num_experts=1, - input_aggregated_moe_loss=False, return_aggregated_moe_loss=False): + + def __init__( + self, + config, + layer_number, + layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.padding, + drop_path_rate=0.0, + num_experts=1, + input_aggregated_moe_loss=False, + return_aggregated_moe_loss=False, + ): self.input_aggregated_moe_loss = input_aggregated_moe_loss self.return_aggregated_moe_loss = return_aggregated_moe_loss - super().__init__(config, layer_number, layer_type, self_attn_mask_type, drop_path_rate, num_experts) + super().__init__( + config, + layer_number, + layer_type, + self_attn_mask_type, + drop_path_rate, + num_experts, + ) def forward(self, inputs, **kwargs): assert torch.is_tensor(inputs) or isinstance(inputs, tuple) - if not hasattr(self, '_args'): + if not hasattr(self, "_args"): self._args = get_args() - rotary_pos_emb = self._args.rotary_pos_emb if self._args.use_rotary_position_embeddings else None + rotary_pos_emb = ( + self._args.rotary_pos_emb + if self._args.use_rotary_position_embeddings + else None + ) if torch.is_tensor(inputs) or len(inputs) == 1: - assert not self.input_aggregated_moe_loss, f'Expecting an input tuple of size >= 2' + assert ( + not self.input_aggregated_moe_loss + ), f"Expecting an input tuple of size >= 2" # No attention mask forwarded, search for args.attn_mask hidden_states, attention_mask = inputs, self._args.attn_mask - output, moe_loss = super().forward(hidden_states, attention_mask, **kwargs, rotary_pos_emb=rotary_pos_emb) + output, moe_loss = super().forward( + hidden_states, attention_mask, **kwargs, rotary_pos_emb=rotary_pos_emb + ) return (output, moe_loss) if self.return_aggregated_moe_loss else output elif len(inputs) in (2, 3): # Attention mask and aggregated_moe can both be activations. 
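`ParallelTransformerLayerPipe.forward` above exists because DeepSpeed pipeline stages pass activations as a bare tensor or a flat tuple; the layer has to unpack `(hidden_states[, attention_mask[, aggregated_moe_loss]])` and repack whatever the next stage expects. A toy sketch of that unpack/repack convention, with no Megatron dependencies and a hypothetical `layer` standing in for `super().forward(...)`:

```python
import torch

def pipe_forward(inputs, layer, default_mask=None, return_moe_loss=False):
    # Normalize the pipeline activation into (hidden_states, attention_mask, moe_loss).
    if torch.is_tensor(inputs):
        hidden, mask, moe_in = inputs, default_mask, None
    elif len(inputs) == 2:
        (hidden, mask), moe_in = inputs, None
    elif len(inputs) == 3:
        hidden, mask, moe_in = inputs
    else:
        raise RuntimeError("Received more inputs than understood.")

    output = layer(hidden)                              # stand-in for super().forward(...)
    moe_loss = torch.zeros(()) if moe_in is None else moe_in

    ret = (output, mask) if mask is not None else (output,)
    return ret + (moe_loss,) if return_moe_loss else ret

layer = torch.nn.Identity()
h = torch.randn(4, 2, 8)
print(len(pipe_forward((h, torch.ones(2, 4)), layer, return_moe_loss=True)))   # 3
```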
@@ -1437,22 +1681,28 @@ def forward(self, inputs, **kwargs): hidden_states, attention_mask = inputs[0], inputs[1] return_attention_mask = True else: - hidden_states, attention_mask, aggregated_moe_loss = inputs[0], inputs[1], inputs[2] + hidden_states, attention_mask, aggregated_moe_loss = ( + inputs[0], + inputs[1], + inputs[2], + ) # Forward aggregated_moe_loss to ParallelTransformerLayer for further accumulation if self.input_aggregated_moe_loss: - kwargs.update({'aggregated_moe_loss': aggregated_moe_loss}) + kwargs.update({"aggregated_moe_loss": aggregated_moe_loss}) - output, moe_loss = super().forward(hidden_states, attention_mask, **kwargs, rotary_pos_emb=rotary_pos_emb) + output, moe_loss = super().forward( + hidden_states, attention_mask, **kwargs, rotary_pos_emb=rotary_pos_emb + ) - ret = (output, ) + ret = (output,) if return_attention_mask: - ret += (attention_mask, ) + ret += (attention_mask,) if self.return_aggregated_moe_loss: - ret += (moe_loss, ) + ret += (moe_loss,) return ret else: - raise RuntimeError('Received more inputs than understood.') + raise RuntimeError("Received more inputs than understood.") class NoopTransformerLayer(MegatronModule): @@ -1475,15 +1725,20 @@ def __init__(self, layer_number): super().__init__() self.layer_number = layer_number - def forward(self, hidden_states, attention_mask, - encoder_output=None, enc_dec_attn_mask=None, - inference_params=None): + def forward( + self, + hidden_states, + attention_mask, + encoder_output=None, + enc_dec_attn_mask=None, + inference_params=None, + ): return hidden_states.clone() def _get_num_layers(args, model_type, is_decoder=False): """Compute the number of transformer layers resident on the current rank.""" - is_encoder_and_decoder_model = (model_type == ModelType.encoder_and_decoder) + is_encoder_and_decoder_model = model_type == ModelType.encoder_and_decoder if model_type == ModelType.retro_encoder: num_layers = args.retro_encoder_layers elif parallel_state.get_pipeline_model_parallel_world_size() > 1: @@ -1496,27 +1751,34 @@ def _get_num_layers(args, model_type, is_decoder=False): # the same whether or not a standalone embedding stage is used. 
num_ranks_in_encoder = ( args.pipeline_model_parallel_split_rank - 1 - if args.standalone_embedding_stage else - args.pipeline_model_parallel_split_rank + if args.standalone_embedding_stage + else args.pipeline_model_parallel_split_rank + ) + num_ranks_in_decoder = ( + args.transformer_pipeline_model_parallel_size - num_ranks_in_encoder + ) + assert args.encoder_num_layers % num_ranks_in_encoder == 0, ( + "encoder_num_layers (%d) must be divisible by number of ranks given to encoder (%d)" + % (args.encoder_num_layers, num_ranks_in_encoder) + ) + assert args.decoder_num_layers % num_ranks_in_decoder == 0, ( + "decoder_num_layers (%d) must be divisible by number of ranks given to decoder (%d)" + % (args.decoder_num_layers, num_ranks_in_decoder) ) - num_ranks_in_decoder = args.transformer_pipeline_model_parallel_size - num_ranks_in_encoder - assert args.encoder_num_layers % num_ranks_in_encoder == 0, \ - 'encoder_num_layers (%d) must be divisible by number of ranks given to encoder (%d)' % (args.encoder_num_layers, num_ranks_in_encoder) - assert args.decoder_num_layers % num_ranks_in_decoder == 0, \ - 'decoder_num_layers (%d) must be divisible by number of ranks given to decoder (%d)' % (args.decoder_num_layers, num_ranks_in_decoder) if parallel_state.is_pipeline_stage_before_split(): num_layers = ( 0 if args.standalone_embedding_stage - and parallel_state.get_pipeline_model_parallel_rank() == 0 else - args.encoder_num_layers // num_ranks_in_encoder + and parallel_state.get_pipeline_model_parallel_rank() == 0 + else args.encoder_num_layers // num_ranks_in_encoder ) else: num_layers = args.decoder_num_layers // num_ranks_in_decoder else: assert args.num_layers == args.encoder_num_layers - assert args.num_layers % args.transformer_pipeline_model_parallel_size == 0, \ - 'num_layers must be divisible by transformer_pipeline_model_parallel_size' + assert ( + args.num_layers % args.transformer_pipeline_model_parallel_size == 0 + ), "num_layers must be divisible by transformer_pipeline_model_parallel_size" # When a standalone embedding stage is used, all transformer layers # are divided among pipeline rank >= 1, while on pipeline rank 0, @@ -1525,8 +1787,8 @@ def _get_num_layers(args, model_type, is_decoder=False): num_layers = ( 0 if args.standalone_embedding_stage - and parallel_state.get_pipeline_model_parallel_rank() == 0 else - args.num_layers // args.transformer_pipeline_model_parallel_size + and parallel_state.get_pipeline_model_parallel_rank() == 0 + else args.num_layers // args.transformer_pipeline_model_parallel_size ) else: if not is_decoder: @@ -1536,14 +1798,15 @@ def _get_num_layers(args, model_type, is_decoder=False): return num_layers -def _get_layer_type(model_type, default_layer_type, retro_layer_numbers, - layer_number): +def _get_layer_type(model_type, default_layer_type, retro_layer_numbers, layer_number): args = get_args() if args.retro_add_retriever and layer_number in retro_layer_numbers: if model_type == ModelType.retro_decoder: - return LayerType.retro_decoder_with_retriever \ - if layer_number == retro_layer_numbers[0] \ - else LayerType.retro_decoder + return ( + LayerType.retro_decoder_with_retriever + if layer_number == retro_layer_numbers[0] + else LayerType.retro_decoder + ) elif model_type == ModelType.retro_encoder: return LayerType.retro_encoder else: @@ -1552,15 +1815,22 @@ def _get_layer_type(model_type, default_layer_type, retro_layer_numbers, return default_layer_type -def get_num_experts_per_layer(num_experts: list, num_layers: int, expert_interval: int, 
offset: int = 0) -> list: - assert len(num_experts) == 1 or len(num_experts) == num_layers // expert_interval, \ - 'num_experts must be either a single value or a list of the same length as the number of MoE layers' +def get_num_experts_per_layer( + num_experts: list, num_layers: int, expert_interval: int, offset: int = 0 +) -> list: + assert ( + len(num_experts) == 1 or len(num_experts) == num_layers // expert_interval + ), "num_experts must be either a single value or a list of the same length as the number of MoE layers" if len(num_experts) == 1: num_experts = num_experts * (num_layers // expert_interval) experts_per_layer = [] for i in range(num_layers): layer_num = i + 1 + offset - n_e = num_experts[(layer_num-1) // expert_interval] if layer_num % expert_interval == 0 else 1 + n_e = ( + num_experts[(layer_num - 1) // expert_interval] + if layer_num % expert_interval == 0 + else 1 + ) experts_per_layer.append(n_e) return experts_per_layer @@ -1568,14 +1838,18 @@ def get_num_experts_per_layer(num_experts: list, num_layers: int, expert_interva class ParallelTransformer(MegatronModule): """Transformer class.""" - def __init__(self, config, - model_type, layer_type=LayerType.encoder, - self_attn_mask_type=AttnMaskType.padding, - post_layer_norm=True, - pre_process=True, - post_process=True, - drop_path_rate=0.0, - num_experts=[1]): + def __init__( + self, + config, + model_type, + layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.padding, + post_layer_norm=True, + pre_process=True, + post_process=True, + drop_path_rate=0.0, + num_experts=[1], + ): super(ParallelTransformer, self).__init__() args = get_args() @@ -1598,14 +1872,15 @@ def __init__(self, config, self.recompute_granularity = config.recompute_granularity self.recompute_method = config.recompute_method self.recompute_num_layers = config.recompute_num_layers - self.distribute_saved_activations = \ + self.distribute_saved_activations = ( config.distribute_saved_activations and not config.sequence_parallel + ) self.sequence_parallel = config.sequence_parallel # Transformer Engine Init. self.transformer_engine_rope_available = False - if self.transformer_impl == 'transformer_engine': + if self.transformer_impl == "transformer_engine": global transformer_engine import transformer_engine from importlib.metadata import version @@ -1637,45 +1912,53 @@ def __init__(self, config, self.num_microbatches_in_previous_step = -1 self.microbatch_count = 0 - self.checkpoint_core_attention = config.recompute_granularity == 'selective' + self.checkpoint_core_attention = config.recompute_granularity == "selective" # Number of layers. - self.num_layers = _get_num_layers(args, model_type, - layer_type==LayerType.decoder) + self.num_layers = _get_num_layers( + args, model_type, layer_type == LayerType.decoder + ) self.drop_path_rates = [ - rate.item() for rate in - torch.linspace(0, self.drop_path_rate, config.num_layers)] + rate.item() + for rate in torch.linspace(0, self.drop_path_rate, config.num_layers) + ] self.retro_layer_numbers = None if model_type == ModelType.retro_decoder: retro_layer_start = 6 if config.num_layers <= 15 else 9 - self.retro_layer_numbers = \ - np.arange(retro_layer_start, args.num_layers + 1, 3).tolist() + self.retro_layer_numbers = np.arange( + retro_layer_start, args.num_layers + 1, 3 + ).tolist() if model_type == ModelType.retro_encoder: self.retro_layer_numbers = [1] # Transformer layers. 
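A quick worked example of `get_num_experts_per_layer` (defined just above) may help; this is an illustrative sketch only, not part of the patch, and assumes the patched `megatron.model.transformer` module is importable:

```python
# Illustrative only: expected output of get_num_experts_per_layer as defined above.
from megatron.model.transformer import get_num_experts_per_layer  # patched module

# A single global expert count with expert_interval=2 makes every second layer MoE:
assert get_num_experts_per_layer(num_experts=[8], num_layers=4, expert_interval=2) == [1, 8, 1, 8]

# An explicit list (one entry per MoE layer, i.e. num_layers // expert_interval entries):
assert get_num_experts_per_layer(num_experts=[4, 16], num_layers=4, expert_interval=2) == [1, 4, 1, 16]
```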
if args.retro_add_retriever: - assert self.recompute_granularity != 'full', \ - "Full recompute not supported for Retro." - assert args.transformer_impl == 'local', \ - "Transformer engine does not support Retro layers." + assert ( + self.recompute_granularity != "full" + ), "Full recompute not supported for Retro." + assert ( + args.transformer_impl == "local" + ), "Transformer engine does not support Retro layers." + def build_layer(layer_number, n_e): - if args.transformer_impl == 'local': + if args.transformer_impl == "local": current_layer_type = _get_layer_type( - model_type, layer_type, self.retro_layer_numbers, - layer_number) + model_type, layer_type, self.retro_layer_numbers, layer_number + ) return ParallelTransformerLayer( config, layer_number, layer_type=current_layer_type, self_attn_mask_type=self_attn_mask_type, drop_path_rate=self.drop_path_rates[layer_number - 1], - num_experts=n_e) + num_experts=n_e, + ) else: - assert config.num_attention_heads == config.num_key_value_heads, \ - 'Transformer_engine does not support GQA' + assert ( + config.num_attention_heads == config.num_key_value_heads + ), "Transformer_engine does not support GQA" return transformer_engine.pytorch.TransformerLayer( config.hidden_size, config.ffn_hidden_size, @@ -1702,16 +1985,22 @@ def build_layer(layer_number, n_e): layer_type="encoder", drop_path_rate=self.drop_path_rates[layer_number - 1], set_parallel_mode=True, - fuse_qkv_params=True) + fuse_qkv_params=True, + ) if config.virtual_pipeline_model_parallel_size is not None: - assert config.num_layers % config.virtual_pipeline_model_parallel_size == 0, \ - 'num_layers_per_stage must be divisible by ' \ - 'virtual_pipeline_model_parallel_size' + assert ( + config.num_layers % config.virtual_pipeline_model_parallel_size == 0 + ), ( + "num_layers_per_stage must be divisible by " + "virtual_pipeline_model_parallel_size" + ) assert args.model_type != ModelType.encoder_and_decoder # Number of layers in each model chunk is the number of layers in the stage, # divided by the number of model chunks in a stage. - self.num_layers = self.num_layers // config.virtual_pipeline_model_parallel_size + self.num_layers = ( + self.num_layers // config.virtual_pipeline_model_parallel_size + ) # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of # layers to stages like (each list is a model chunk): # Stage 0: [0] [2] [4] [6] @@ -1721,12 +2010,14 @@ def build_layer(layer_number, n_e): # Stage 0: [0, 1] [4, 5] # Stage 1: [2, 3] [6, 7] offset = parallel_state.get_virtual_pipeline_model_parallel_rank() * ( - config.num_layers // config.virtual_pipeline_model_parallel_size) + \ - (parallel_state.get_pipeline_model_parallel_rank() * self.num_layers) + config.num_layers // config.virtual_pipeline_model_parallel_size + ) + (parallel_state.get_pipeline_model_parallel_rank() * self.num_layers) else: # Each stage gets a contiguous set of layers. 
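To make the interleaved (virtual-pipeline) offset arithmetic above concrete, here is a small standalone sketch that reproduces the 8-layer / 2-stage / 4-chunk assignment from the comment; it is illustrative only and does not touch Megatron:

```python
# Illustrative only: reproduces the layer-to-stage assignment described in the
# comment above (8 layers, pipeline parallel size 2, virtual pipeline size 4).
num_layers, pp_size, vpp_size = 8, 2, 4
layers_per_chunk = num_layers // pp_size // vpp_size   # 1 layer per model chunk

for pp_rank in range(pp_size):
    chunks = []
    for vp_rank in range(vpp_size):
        offset = vp_rank * (num_layers // vpp_size) + pp_rank * layers_per_chunk
        chunks.append(list(range(offset, offset + layers_per_chunk)))
    print(f"Stage {pp_rank}: {chunks}")

# Stage 0: [[0], [2], [4], [6]]
# Stage 1: [[1], [3], [5], [7]]
```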
- if args.model_type == ModelType.encoder_and_decoder and \ - parallel_state.get_pipeline_model_parallel_world_size() > 1: + if ( + args.model_type == ModelType.encoder_and_decoder + and parallel_state.get_pipeline_model_parallel_world_size() > 1 + ): pipeline_rank = parallel_state.get_pipeline_model_parallel_rank() if layer_type == LayerType.encoder: offset = pipeline_rank * self.num_layers @@ -1734,7 +2025,9 @@ def build_layer(layer_number, n_e): num_ranks_in_enc = args.pipeline_model_parallel_split_rank offset = (pipeline_rank - num_ranks_in_enc) * self.num_layers else: - offset = parallel_state.get_pipeline_model_parallel_rank() * self.num_layers + offset = ( + parallel_state.get_pipeline_model_parallel_rank() * self.num_layers + ) if self.num_layers == 0: # When a standalone embedding stage is used (e.g., @@ -1746,11 +2039,13 @@ def build_layer(layer_number, n_e): # this, we assign a 'no-op' layer on these ranks, which will # disconnect the input tensor from the output tensor. self.num_layers = 1 - self.layers = torch.nn.ModuleList([ NoopTransformerLayer(1) ]) + self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)]) else: # Build the layers self.layers = [] - experts_per_layer = get_num_experts_per_layer(num_experts, self.num_layers, args.expert_interval, offset) + experts_per_layer = get_num_experts_per_layer( + num_experts, self.num_layers, args.expert_interval, offset + ) for i in range(self.num_layers): layer_num = i + 1 + offset n_e = experts_per_layer[i] @@ -1761,41 +2056,54 @@ def build_layer(layer_number, n_e): if model_type == ModelType.retro_encoder: for layer in self.layers: if layer.self_attention.use_flash_attn: - layer.self_attention.core_attention_flash.dropout_p = \ + layer.self_attention.core_attention_flash.dropout_p = ( torch.nn.Dropout(args.retro_encoder_attention_dropout) + ) else: - layer.self_attention.core_attention.attention_dropout.p =\ + layer.self_attention.core_attention.attention_dropout.p = ( args.retro_encoder_attention_dropout + ) layer.hidden_dropout = args.retro_encoder_hidden_dropout if self.post_process and self.post_layer_norm: # Final layer norm before output. 
- if args.normalization == 'layernorm': - if get_accelerator().device_name() == 'cuda': + if args.normalization == "layernorm": + if get_accelerator().device_name() == "cuda": self.final_layernorm = LayerNorm( config.hidden_size, eps=config.layernorm_epsilon, no_persist_layer_norm=args.no_persist_layer_norm, sequence_parallel=config.sequence_parallel, apply_layernorm_1p=args.apply_layernorm_1p, - mem_efficient_ln=args.mem_efficient_ln) + mem_efficient_ln=args.mem_efficient_ln, + ) else: self.final_layernorm = LayerNorm( - config.hidden_size, - eps=config.layernorm_epsilon) + config.hidden_size, eps=config.layernorm_epsilon + ) else: - self.final_layernorm = RMSNorm(config.hidden_size, config.layernorm_epsilon, - sequence_parallel=config.sequence_parallel) + self.final_layernorm = RMSNorm( + config.hidden_size, + config.layernorm_epsilon, + sequence_parallel=config.sequence_parallel, + ) def _get_layer(self, layer_number): return self.layers[layer_number] - def _checkpointed_forward(self, hidden_states, attention_mask, - encoder_output, enc_dec_attn_mask, - rotary_pos_emb, is_first_microbatch): + def _checkpointed_forward( + self, + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + rotary_pos_emb, + is_first_microbatch, + ): args = get_args() """Forward method with activation checkpointing.""" + def custom(start, end): def custom_forward(*args, **kwargs): x_, *args = args @@ -1807,11 +2115,14 @@ def custom_forward(*args, **kwargs): x_, moe_loss = output else: x_ = output - moe_loss = torch.tensor(0.0, device=x_.device, dtype=x_.dtype, requires_grad=True) + moe_loss = torch.tensor( + 0.0, device=x_.device, dtype=x_.dtype, requires_grad=True + ) moe_losses.append(moe_loss) return (x_, *moe_losses) + return custom_forward - + if args.deepspeed and args.deepspeed_activation_checkpointing: moe_losses = [] # Make sure memory is freed. @@ -1819,9 +2130,18 @@ def custom_forward(*args, **kwargs): l = 0 while l < self.num_layers: hidden_states, *local_moe_losses = tensor_parallel.checkpoint( - custom(l, l + self.checkpoint_num_layers), False, - hidden_states, attention_mask, encoder_output, enc_dec_attn_mask, - None, None, None, None, rotary_pos_emb) + custom(l, l + self.checkpoint_num_layers), + False, + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + None, + None, + None, + None, + rotary_pos_emb, + ) moe_losses.extend(local_moe_losses) l += self.checkpoint_num_layers @@ -1829,66 +2149,105 @@ def custom_forward(*args, **kwargs): else: moe_losses = [] te_forward_kwargs = {} - if self.transformer_impl == 'transformer_engine': - te_forward_kwargs['is_first_microbatch'] = is_first_microbatch + if self.transformer_impl == "transformer_engine": + te_forward_kwargs["is_first_microbatch"] = is_first_microbatch if self.transformer_engine_rope_available: - te_forward_kwargs['rotary_pos_emb'] = rotary_pos_emb + te_forward_kwargs["rotary_pos_emb"] = rotary_pos_emb - if self.recompute_method == 'uniform': + if self.recompute_method == "uniform": # Uniformly divide the total number of Transformer layers and # checkpoint the input activation of each divided chunk. # A method to further reduce memory usage reducing checkpoints. 
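The final-norm branch above falls back to RMSNorm whenever `args.normalization` is not `'layernorm'`. As a reference for readers unfamiliar with it, here is the core math only (a minimal sketch, not Megatron's sequence-parallel implementation):

```python
# Minimal reference RMSNorm: normalize by the root-mean-square over the hidden
# dimension, then apply a learned scale. Megatron's RMSNorm adds sequence-parallel
# handling on top of this; only the core operation is shown here.
import torch

def rms_norm_reference(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    rms = x.pow(2).mean(dim=-1, keepdim=True).add(eps).rsqrt()
    return x * rms * weight

hidden = torch.randn(4, 128)   # [tokens, hidden_size]
scale = torch.ones(128)        # learned weight, initialized to 1
out = rms_norm_reference(hidden, scale)
```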
l = 0 while l < self.num_layers: - if self.transformer_impl == 'transformer_engine': - hidden_states, *local_moe_losses = transformer_engine.pytorch.distributed.checkpoint( - custom(l, l + self.recompute_num_layers), - self.distribute_saved_activations, - tensor_parallel.get_cuda_rng_tracker, - mpu.get_tensor_model_parallel_group(), - hidden_states, attention_mask, encoder_output, - enc_dec_attn_mask, **te_forward_kwargs) + if self.transformer_impl == "transformer_engine": + hidden_states, *local_moe_losses = ( + transformer_engine.pytorch.distributed.checkpoint( + custom(l, l + self.recompute_num_layers), + self.distribute_saved_activations, + tensor_parallel.get_cuda_rng_tracker, + mpu.get_tensor_model_parallel_group(), + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + **te_forward_kwargs, + ) + ) else: hidden_states, *local_moe_losses = tensor_parallel.checkpoint( custom(l, l + self.recompute_num_layers), self.distribute_saved_activations, - hidden_states, attention_mask, - encoder_output, enc_dec_attn_mask, - None, None, None, None, rotary_pos_emb) + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + None, + None, + None, + None, + rotary_pos_emb, + ) moe_losses.extend(local_moe_losses) l += self.recompute_num_layers - elif self.recompute_method == 'block': + elif self.recompute_method == "block": # Checkpoint the input activation of only a set number of individual # Transformer layers and skip the rest. # A method fully use the device memory removing redundant re-computation. for l in range(self.num_layers): if l < self.recompute_num_layers: - if self.transformer_impl == 'transformer_engine': - hidden_states, *local_moe_losses = transformer_engine.pytorch.distributed.checkpoint( - custom(l, l + 1), - self.distribute_saved_activations, - tensor_parallel.get_cuda_rng_tracker, - mpu.get_tensor_model_parallel_group(), - hidden_states, attention_mask, encoder_output, - enc_dec_attn_mask, **te_forward_kwargs) + if self.transformer_impl == "transformer_engine": + hidden_states, *local_moe_losses = ( + transformer_engine.pytorch.distributed.checkpoint( + custom(l, l + 1), + self.distribute_saved_activations, + tensor_parallel.get_cuda_rng_tracker, + mpu.get_tensor_model_parallel_group(), + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + **te_forward_kwargs, + ) + ) else: - hidden_states, *local_moe_losses = tensor_parallel.checkpoint( - custom(l, l + 1), - self.distribute_saved_activations, - hidden_states, attention_mask, - encoder_output, enc_dec_attn_mask, - None, None, None, None, rotary_pos_emb) + hidden_states, *local_moe_losses = ( + tensor_parallel.checkpoint( + custom(l, l + 1), + self.distribute_saved_activations, + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + None, + None, + None, + None, + rotary_pos_emb, + ) + ) else: - if self.transformer_impl == 'transformer_engine': + if self.transformer_impl == "transformer_engine": hidden_states, *local_moe_losses = custom(l, l + 1)( - hidden_states, attention_mask, encoder_output, - enc_dec_attn_mask, **te_forward_kwargs) + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + **te_forward_kwargs, + ) else: hidden_states, *local_moe_losses = custom(l, l + 1)( - hidden_states, attention_mask, - encoder_output, enc_dec_attn_mask, - None, None, None, None, rotary_pos_emb) - + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + None, + None, + None, + None, + rotary_pos_emb, + ) + 
moe_losses.extend(local_moe_losses) else: raise ValueError("Invalid activation recompute method.") @@ -1904,19 +2263,25 @@ def set_input_tensor(self, input_tensor): forward_step_func""" self.input_tensor = input_tensor - def forward(self, hidden_states, attention_mask, - encoder_output=None, enc_dec_attn_mask=None, - retriever_input=None, - retriever_output=None, - retriever_attn_mask=None, - inference_params=None, - rotary_pos_emb=None): + def forward( + self, + hidden_states, + attention_mask, + encoder_output=None, + enc_dec_attn_mask=None, + retriever_input=None, + retriever_output=None, + retriever_attn_mask=None, + inference_params=None, + rotary_pos_emb=None, + ): # hidden_states: [s, b, h] # Checks. if inference_params: - assert self.recompute_granularity is None, \ - 'inference does not work with activation checkpointing' + assert ( + self.recompute_granularity is None + ), "inference does not work with activation checkpointing" # TODO: Below old DeepSpeed code are commented because it's unsure whether # it is still relevant. @@ -1971,64 +2336,77 @@ def forward(self, hidden_states, attention_mask, with rng_context: # The fp8_autocast context manager is a no-op when enabled=True # The if...else serves to short circuit name resolution for fp8_autocast - with transformer_engine.pytorch.fp8_autocast( - enabled=self.use_fp8, - fp8_recipe=self.fp8_recipe, - fp8_group=self.fp8_group - ) if self.use_fp8 else nullcontext(): + with ( + transformer_engine.pytorch.fp8_autocast( + enabled=self.use_fp8, + fp8_recipe=self.fp8_recipe, + fp8_group=self.fp8_group, + ) + if self.use_fp8 + else nullcontext() + ): # Determine if the current iteration is first microbatch if self.num_microbatches_in_previous_step != get_num_microbatches(): - self.microbatch_count = 0 # Reset count on new batch size rampup interval + self.microbatch_count = ( + 0 # Reset count on new batch size rampup interval + ) self.num_microbatches_in_previous_step = get_num_microbatches() - is_first_microbatch = self.microbatch_count % get_num_microbatches() == 0 + is_first_microbatch = ( + self.microbatch_count % get_num_microbatches() == 0 + ) # Forward pass. 
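As a side-by-side of the two recompute paths above, this sketch (illustrative only) shows which layer indices each method checkpoints for `num_layers=8`, `recompute_num_layers=2`:

```python
# 'uniform': walk the layers in chunks of recompute_num_layers and checkpoint the
# input of every chunk, so all layers are recomputed during the backward pass.
num_layers, recompute_num_layers = 8, 2
uniform_chunks = [
    (l, l + recompute_num_layers) for l in range(0, num_layers, recompute_num_layers)
]
print(uniform_chunks)        # [(0, 2), (2, 4), (4, 6), (6, 8)]

# 'block': checkpoint only the first recompute_num_layers layers and run the rest
# without recomputation, trading a little extra memory for less recompute work.
block_checkpointed = [l for l in range(num_layers) if l < recompute_num_layers]
print(block_checkpointed)    # [0, 1]
```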
moe_losses = [] if self.checkpoint_activations: - hidden_states, moe_losses = self._checkpointed_forward(hidden_states, - attention_mask, - encoder_output, - enc_dec_attn_mask, - rotary_pos_emb, - is_first_microbatch) - elif self.recompute_granularity == 'full': - hidden_states, moe_losses = self._checkpointed_forward(hidden_states, - attention_mask, - encoder_output, - enc_dec_attn_mask, - rotary_pos_emb, - is_first_microbatch) + hidden_states, moe_losses = self._checkpointed_forward( + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + rotary_pos_emb, + is_first_microbatch, + ) + elif self.recompute_granularity == "full": + hidden_states, moe_losses = self._checkpointed_forward( + hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + rotary_pos_emb, + is_first_microbatch, + ) else: forward_kwargs = { - 'encoder_output': encoder_output, - 'enc_dec_attn_mask': enc_dec_attn_mask, - 'inference_params': inference_params, + "encoder_output": encoder_output, + "enc_dec_attn_mask": enc_dec_attn_mask, + "inference_params": inference_params, } - if self.transformer_impl == 'transformer_engine': - forward_kwargs['is_first_microbatch'] = is_first_microbatch - forward_kwargs['checkpoint_core_attention'] = self.checkpoint_core_attention + if self.transformer_impl == "transformer_engine": + forward_kwargs["is_first_microbatch"] = is_first_microbatch + forward_kwargs["checkpoint_core_attention"] = ( + self.checkpoint_core_attention + ) if self.transformer_engine_rope_available: - forward_kwargs['rotary_pos_emb'] = rotary_pos_emb + forward_kwargs["rotary_pos_emb"] = rotary_pos_emb else: - forward_kwargs['rotary_pos_emb'] = rotary_pos_emb - forward_kwargs['retriever_input'] = retriever_input - forward_kwargs['retriever_output'] = retriever_output - forward_kwargs['retriever_attn_mask'] = retriever_attn_mask + forward_kwargs["rotary_pos_emb"] = rotary_pos_emb + forward_kwargs["retriever_input"] = retriever_input + forward_kwargs["retriever_output"] = retriever_output + forward_kwargs["retriever_attn_mask"] = retriever_attn_mask for index in range(self.num_layers): layer = self._get_layer(index) hidden_states = layer( - hidden_states, - attention_mask, - **forward_kwargs) + hidden_states, attention_mask, **forward_kwargs + ) # First Retro decoder layer returns both hidden_states # and retriever_output. Make retriever_output available # to subsequence Retro layers. 
if isinstance(hidden_states, tuple): - assert (len(hidden_states) == 2 or len(hidden_states) == 3) + assert len(hidden_states) == 2 or len(hidden_states) == 3 if len(hidden_states) == 2: if not self.ds_inference: hidden_states, moe_loss = hidden_states @@ -2054,6 +2432,7 @@ def forward(self, hidden_states, attention_mask, return (hidden_states, *moe_losses) + class LMHeadPipe(MegatronModule): """ Arguments: @@ -2067,11 +2446,13 @@ class LMHeadPipe(MegatronModule): def __init__(self, hidden_size, vocab_size, config): args = get_args() super(LMHeadPipe, self).__init__() - self.lm_head = tensor_parallel.ColumnParallelLinear(input_size=hidden_size, - output_size=vocab_size, - bias=False, - config=config, - init_method=config.init_method,) + self.lm_head = tensor_parallel.ColumnParallelLinear( + input_size=hidden_size, + output_size=vocab_size, + bias=False, + config=config, + init_method=config.init_method, + ) def forward(self, inputs, **kwargs): assert torch.is_tensor(inputs) or isinstance(inputs, tuple) @@ -2080,10 +2461,10 @@ def forward(self, inputs, **kwargs): else: hidden_states = inputs - if not hasattr(self, '_args'): + if not hasattr(self, "_args"): self._args = get_args() - if hasattr(self._args, 'attn_mask'): + if hasattr(self._args, "attn_mask"): attention_mask = None else: attention_mask = inputs[1] @@ -2091,7 +2472,7 @@ def forward(self, inputs, **kwargs): logits, _ = self.lm_head(hidden_states) # If cmd args has attn_mask, we don't forward it as an activation. - if hasattr(self._args, 'attn_mask'): + if hasattr(self._args, "attn_mask"): return logits else: return logits, attention_mask diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index 12a458375d..99145ff4f4 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -1,13 +1,9 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. from deepspeed.accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': - from apex.optimizers import FusedAdam as Adam - from apex.optimizers import FusedSGD as SGD -else: - from torch.optim import Adam - from torch.optim import SGD +import torch +from typing import Callable, Any, Iterable, Union from megatron import get_args from .distrib_optimizer import DistributedOptimizer @@ -15,19 +11,60 @@ from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer -def get_param_groups(modules, - no_weight_decay_cond, - scale_lr_cond, - lr_mult): - """creates param groups based on weight decay condition (regularized vs non regularized) - and learning rate scale condition (args.lr vs lr_mult * args.lr) - scale_lr_cond is used during finetuning where head of the network requires a scaled - version of the base learning rate. +import ezpz as ez +RANK = ez.get_rank() + + +def get_param_groups( + modules: Union[torch.nn.Module, Iterable[torch.nn.Module]], + no_weight_decay_cond: Callable[[str, torch.Tensor], bool], + scale_lr_cond: Callable[[str, torch.Tensor], bool], + lr_mult: Any, + use_galore: bool = False, +): + """ + Creates param groups (regularized vs non) based on: + + - weight decay condition. + - learning rate scale condition (args.lr vs lr_mult * args.lr) + - scale_lr_cond is used during finetuning, where head of the network + requires a scaled version of the base learning rate. 
+ # if 'galore' in args.optimizer.lower(): + # # make parameters with "rank" to a single group, if param_name has "mlp" or "attn" + # galore_params = [] + # target_modules_list = ["attn", "mlp"] + # # for module_name, module in param_groups: + # for group_id, group in enumerate(param_groups): + # for param, p in enumerate(group['params']): + # if not isinstance(module, torch.nn.Linear): + # continue + # if not any(target_key in module_name for target_key in target_modules_list): + # continue + # print('enable GaLore for weights in module: ', module_name) + # galore_params.append(module.weight) + # id_galore_params = [id(p) for p in galore_params] + # # make parameters without "rank" to another group + # regular_params = [p for p in param_groups if id(p) not in id_galore_params] + # # then call galore_adamw + # param_groups = [ + # { + # 'params': regular_params + # }, + # { + # 'params': galore_params, + # 'rank': RANK, + # 'update_proj_gap': args.update_proj_gap, + # 'scale': args.galore_scale, + # 'proj_type': args.proj_type + # } + # ] """ wd_no_scale_lr = [] wd_scale_lr = [] no_wd_no_scale_lr = [] no_wd_scale_lr = [] + galore_params = [] + target_modules_list = ["attn", "mlp"] for module in modules: for name, param in module.named_parameters(): if not param.requires_grad: @@ -65,20 +102,30 @@ def get_param_groups(modules, return param_groups -def get_megatron_optimizer(model, - no_weight_decay_cond=None, - scale_lr_cond=None, - lr_mult=1.0): + +def get_megatron_optimizer( + model, + no_weight_decay_cond=None, + scale_lr_cond=None, + lr_mult=1.0 +): args = get_args() + assert args is not None # Base optimizer. - param_groups = get_param_groups(model, - no_weight_decay_cond, - scale_lr_cond, - lr_mult) + param_groups = get_param_groups( + model, + no_weight_decay_cond, + scale_lr_cond, + lr_mult + ) if args.create_moe_param_group: - from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer - param_groups = split_params_into_different_moe_groups_for_optimizer(param_groups) + from deepspeed.moe.utils import ( + split_params_into_different_moe_groups_for_optimizer + ) + param_groups = split_params_into_different_moe_groups_for_optimizer( + param_groups + ) if args.cpu_optimizer: assert args.optimizer == 'adam', 'CPU offloading is for Adam' @@ -87,45 +134,209 @@ def get_megatron_optimizer(model, else: from deepspeed.ops.adam import DeepSpeedCPUAdam cpu_adam_optimizer = DeepSpeedCPUAdam - optimizer = cpu_adam_optimizer(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps) - else: - if args.optimizer == 'adam': - if args.ds_fused_adam: - global Adam - from deepspeed.ops.adam import FusedAdam - Adam = FusedAdam - optimizer = Adam(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps) - elif args.optimizer == 'sgd': - optimizer = SGD(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - momentum=args.sgd_momentum) - else: - raise Exception('{} optimizer is not supported.'.format( - args.optimizer)) + optimizer = cpu_adam_optimizer( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + ) + elif args.optimizer.lower() == "galore_adamw": + from galore_torch import GaLoreAdamW, GaLoreAdamW8bit + # redefine way to call galore_adamw + optimizer = GaLoreAdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay) + elif args.optimizer.lower() == 
"galore_adamw": + # redefine way to call galore_adamw + optimizer = GaLoreAdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay) + # implement adafactor + elif args.optimizer.lower() == "adafactor": + import transformers + args.beta1 = None if args.beta1 == 0.0 else args.beta1 + optimizer = transformers.optimization.Adafactor( + param_groups, + lr=args.lr, + eps=(1e-30, 1e-3), + clip_threshold=1.0, + decay_rate=-0.8, + beta1=args.beta1, + weight_decay=args.weight_decay, + relative_step=False, + scale_parameter=False, + warmup_init=False, + ) + # low-rank adafactor + elif args.optimizer.lower() == "galore_adafactor": + args.beta1 = None if args.beta1 == 0.0 else args.beta1 + optimizer = GaLoreAdafactor( + param_groups, + lr=args.lr, + eps=(1e-30, 1e-3), + clip_threshold=1.0, + decay_rate=-0.8, + beta1=args.beta1, + weight_decay=args.weight_decay, + relative_step=False, + scale_parameter=False, + warmup_init=False, + ) + # 8-bit Adam + elif args.optimizer.lower() == "adam8bit": + import bitsandbytes as bnb + optimizer = bnb.optim.Adam8bit(param_groups, lr=args.lr, weight_decay=args.weight_decay) + elif args.optimizer.lower() == "galore_adamw8bit": + optimizer = GaLoreAdamW8bit(param_groups, lr=args.lr, weight_decay=args.weight_decay) + elif args.optimizer.lower() == 'galore_adamw8bit_per_layer': + # TODO: seems scheduler call twice in one update step, need to check, for now double the num_training_steps, warmup_steps and update_proj_gap + optimizer_dict = {} + for p in model.parameters(): + if p.requires_grad: + if id(p) in id_galore_params: + optimizer_dict[p] = GaLoreAdamW8bit([{'params': [p], 'rank': args.rank, 'update_proj_gap': args.update_proj_gap * 2, 'scale': args.galore_scale, 'proj_type': args.proj_type}], lr=args.lr, weight_decay=args.weight_decay) + else: + optimizer_dict[p] = bnb.optim.Adam8bit([p], lr=args.lr, weight_decay=args.weight_decay) + # get scheduler dict + scheduler_dict = {} + from galore_torch.peft_pretraining import training_utils + for p in model.parameters(): + if p.requires_grad: + scheduler_dict[p] = training_utils.get_scheculer( + optimizer=optimizer_dict[p], + scheduler_type=args.scheduler, + num_training_steps=args.num_training_steps * 2, + warmup_steps=args.warmup_steps * 2, + min_lr_ratio=args.min_lr_ratio, + ) + + def optimizer_hook(p): + if p.grad is None: + return + optimizer_dict[p].step() + optimizer_dict[p].zero_grad() + scheduler_dict[p].step() + # Register the hook onto every parameter + for p in model.parameters(): + if p.requires_grad: + p.register_post_accumulate_grad_hook(optimizer_hook) + layer_wise_flag = True + elif str(args.optimizer) == 'ipex.lamb': + from intel_extension_for_pytorch.optim._lamb import Lamb + optimizer = Lamb( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + ) + elif str(args.optimizer) == 'ipex.fusedlamb': + from intel_extension_for_pytorch.optim._lamb import Lamb + optimizer = Lamb( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + fused=True, + ) + elif str(args.optimizer).lower() == 'ds.fusedlamb': + from deepspeed.ops.lamb import FusedLamb + optimizer = FusedLamb( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + ) + elif str(args.optimizer).lower() == 'adamwschedulefree': + import schedulefree + optimizer = schedulefree.AdamWScheduleFree( + param_groups, + lr=args.lr, 
+ weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + warmup_steps=args.lr_warmup_iters, + foreach=args.schedulefree_for_each, + ) + elif str(args.optimizer).lower() == 'sgdschedulefree': + import schedulefree + optimizer = schedulefree.SGDScheduleFree( + param_groups, + lr=args.lr, + momentum=args.sgd_momentum, + weight_decay=args.weight_decay, + warmup_steps=args.lr_warmup_iters, + foreach=args.schedulefree_for_each, + ) + elif str(args.optimizer).lower() == 'apex.adam': + assert get_accelerator().device_name() == 'cuda' + from apex.optimizers import FusedAdam as Adam + optimizer = Adam( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps + ) + elif str(args.optimizer).lower() == 'apex.sgd': + from apex.optimizers import FusedSGD as SGD + optimizer = SGD( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + momentum=args.sgd_momentum + ) + elif str(args.optimizer).lower() == 'adamw': + optimizer = torch.optim.AdamW( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps + ) + elif args.optimizer == 'adam': + if args.ds_fused_adam: + # global Adam + from deepspeed.ops.adam import FusedAdam + Adam = FusedAdam + else: + Adam = torch.optim.Adam + optimizer = Adam( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps + ) + elif args.optimizer == 'sgd': + optimizer = torch.optim.SGD( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + momentum=args.sgd_momentum + ) + elif str(args.optimizer).lower() == 'sophiag': + from .sophia import SophiaG + optimizer = SophiaG( + param_groups, + lr=args.lr, + betas=(args.sophiag_beta1, args.sophiag_beta2), + rho = args.sophiag_rho, + weight_decay=args.weight_decay + ) + else: + raise TypeError(f'{args.optimizer} optimizer is not supported.') if args.deepspeed: return optimizer - # Determine whether the params have main-grad field. params_have_main_grad = False if args.use_contiguous_buffers_in_local_ddp: params_have_main_grad = True - # Mixed precision optimizer. # - Note: both the Float16Optimizer and the DistributedOptimizer inherit # from the MixedPrecisionOptimizer, which manages any optimizer where # the model params and main params are distinct. if args.fp16 or args.bf16 or args.use_distributed_optimizer: - # Grad scaler: # if loss-scale is provided, instantiate the constant scaler. # if we are using fp16 and loss-scale is not present, use a @@ -133,11 +344,9 @@ def get_megatron_optimizer(model, # otherwise we are running in bf16 with no loss-scale so # leave it as None. grad_scaler = None - # Constant loss scale. if args.loss_scale: grad_scaler = ConstantGradScaler(args.loss_scale) - # Dynamic loss scale. else: if args.fp16: @@ -148,11 +357,11 @@ def get_megatron_optimizer(model, backoff_factor=0.5, growth_interval=args.loss_scale_window, hysteresis=args.hysteresis) - # Megatron optimizer. - opt_ty = DistributedOptimizer \ - if args.use_distributed_optimizer else \ - Float16OptimizerWithFloat16Params + opt_ty = ( + DistributedOptimizer if args.use_distributed_optimizer + else Float16OptimizerWithFloat16Params + ) return opt_ty(optimizer, args.clip_grad, args.log_num_zeros_in_grad, @@ -163,10 +372,12 @@ def get_megatron_optimizer(model, args.params_dtype, grad_scaler, model) - # FP32. 
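One practical note on the ScheduleFree options added above: schedule-free optimizers keep more than one parameter sequence internally and expect the caller to toggle optimizer train/eval mode. A hedged usage sketch (not part of the patch; assumes the `schedulefree` package is installed):

```python
import torch
import schedulefree

model = torch.nn.Linear(8, 8)
opt = schedulefree.AdamWScheduleFree(model.parameters(), lr=1e-3, warmup_steps=100)

opt.train()                              # before training steps
loss = model(torch.randn(4, 8)).pow(2).mean()
loss.backward()
opt.step()
opt.zero_grad()

opt.eval()                               # before validation or checkpointing
```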
- return FP32Optimizer(optimizer, args.clip_grad, - args.log_num_zeros_in_grad, - params_have_main_grad, - args.use_contiguous_buffers_in_local_ddp, - model) + return FP32Optimizer( + optimizer, + args.clip_grad, + args.log_num_zeros_in_grad, + params_have_main_grad, + args.use_contiguous_buffers_in_local_ddp, + model + ) diff --git a/megatron/optimizer/clip_grads.py b/megatron/optimizer/clip_grads.py index afec8f220c..b5141d0059 100644 --- a/megatron/optimizer/clip_grads.py +++ b/megatron/optimizer/clip_grads.py @@ -8,10 +8,14 @@ except ModuleNotFoundError: from torch import inf as inf -from deepspeed.accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': +# from deepspeed.accelerator import get_accelerator +# if get_accelerator().device_name() == 'cuda': +try: from apex.multi_tensor_apply import multi_tensor_applier import amp_C + HAS_APEX = True +except Exception: + HAS_APEX = False from megatron.model.module import param_is_not_shared from megatron.core.tensor_parallel import param_is_not_tensor_parallel_duplicate @@ -71,7 +75,7 @@ def clip_grad_norm_fp32(parameters, grads_for_norm, else: if norm_type == 2.0: - if get_accelerator().device_name() == 'cuda': + if get_accelerator().device_name() == 'cuda' and HAS_APEX: dummy_overflow_buf = torch.cuda.IntTensor([0]) # Use apex's multi-tensor applier for efficiency reasons. # Multi-tensor applier takes a function and a list of list diff --git a/megatron/optimizer/distrib_optimizer.py b/megatron/optimizer/distrib_optimizer.py index 1aeeac3444..10331607d9 100644 --- a/megatron/optimizer/distrib_optimizer.py +++ b/megatron/optimizer/distrib_optimizer.py @@ -2,10 +2,11 @@ """Megatron distributed optimizer.""" -from deepspeed.accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': +# from deepspeed.accelerator import get_accelerator +# if get_accelerator().device_name() == 'cuda': +try: from apex.optimizers import FusedAdam as Adam -else: +except Exception: from torch.optim import Adam import math diff --git a/megatron/optimizer/sophia.py b/megatron/optimizer/sophia.py new file mode 100644 index 0000000000..4c4e074790 --- /dev/null +++ b/megatron/optimizer/sophia.py @@ -0,0 +1,202 @@ +import math +import torch +from torch import Tensor +from torch.optim.optimizer import Optimizer +from typing import List, Optional + + +#SOphiaG implementation from https://github.com/Liuhong99/Sophia/blob/main/sophia.py, copy pasted here because no pip and not sure about submodules + +class SophiaG(Optimizer): + def __init__(self, params, lr=1e-4, betas=(0.965, 0.99), rho = 0.04, + weight_decay=1e-1, *, maximize: bool = False, + capturable: bool = False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= rho: + raise ValueError("Invalid rho parameter at index 1: {}".format(rho)) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + defaults = dict(lr=lr, betas=betas, rho=rho, + weight_decay=weight_decay, + maximize=maximize, capturable=capturable) + super(SophiaG, self).__init__(params, defaults) + + def __setstate__(self, state): + super().__setstate__(state) + for group in self.param_groups: + group.setdefault('maximize', False) + group.setdefault('capturable', False) + state_values = 
list(self.state.values()) + step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step']) + if not step_is_tensor: + for s in state_values: + s['step'] = torch.tensor(float(s['step'])) + + @torch.no_grad() + def update_hessian(self): + for group in self.param_groups: + beta1, beta2 = group['betas'] + for p in group['params']: + if p.grad is None: + continue + state = self.state[p] + + if len(state) == 0: + state['step'] = torch.zeros((1,), dtype=torch.float, device=p.device) \ + if self.defaults['capturable'] else torch.tensor(0.) + state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + if 'hessian' not in state.keys(): + state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + state['hessian'].mul_(beta2).addcmul_(p.grad, p.grad, value=1 - beta2) + + + @torch.no_grad() + def step(self, closure=None, bs=5120): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + state_steps = [] + hessian = [] + beta1, beta2 = group['betas'] + + for p in group['params']: + if p.grad is None: + continue + params_with_grad.append(p) + + if p.grad.is_sparse: + raise RuntimeError('Hero does not support sparse gradients') + grads.append(p.grad) + state = self.state[p] + # State initialization + if len(state) == 0: + state['step'] = torch.zeros((1,), dtype=torch.float, device=p.device) \ + if self.defaults['capturable'] else torch.tensor(0.) + state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + if 'hessian' not in state.keys(): + state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + exp_avgs.append(state['exp_avg']) + state_steps.append(state['step']) + hessian.append(state['hessian']) + + if self.defaults['capturable']: + bs = torch.ones((1,), dtype=torch.float, device=p.device) * bs + + sophiag(params_with_grad, + grads, + exp_avgs, + hessian, + state_steps, + bs=bs, + beta1=beta1, + beta2=beta2, + rho=group['rho'], + lr=group['lr'], + weight_decay=group['weight_decay'], + maximize=group['maximize'], + capturable=group['capturable']) + + return loss + +def sophiag(params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + hessian: List[Tensor], + state_steps: List[Tensor], + capturable: bool = False, + *, + bs: int, + beta1: float, + beta2: float, + rho: float, + lr: float, + weight_decay: float, + maximize: bool): + + if not all(isinstance(t, torch.Tensor) for t in state_steps): + raise RuntimeError("API has changed, `state_steps` argument must contain a list of singleton tensors") + + + func = _single_tensor_sophiag + + func(params, + grads, + exp_avgs, + hessian, + state_steps, + bs=bs, + beta1=beta1, + beta2=beta2, + rho=rho, + lr=lr, + weight_decay=weight_decay, + maximize=maximize, + capturable=capturable) + +def _single_tensor_sophiag(params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + hessian: List[Tensor], + state_steps: List[Tensor], + *, + bs: int, + beta1: float, + beta2: float, + rho: float, + lr: float, + weight_decay: float, + maximize: bool, + capturable: bool): + + for i, param in enumerate(params): + grad = grads[i] if not maximize else -grads[i] + exp_avg = exp_avgs[i] + hess = hessian[i] + step_t = state_steps[i] + + if capturable: + assert param.is_cuda 
and step_t.is_cuda and bs.is_cuda + + if torch.is_complex(param): + grad = torch.view_as_real(grad) + exp_avg = torch.view_as_real(exp_avg) + hess = torch.view_as_real(hess) + param = torch.view_as_real(param) + + # update step + step_t += 1 + + # Perform stepweight decay + param.mul_(1 - lr * weight_decay) + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + + if capturable: + step_size = lr + step_size_neg = step_size.neg() + + ratio = (exp_avg.abs() / (rho * bs * hess + 1e-15)).clamp(None,1) + param.addcmul_(exp_avg.sign(), ratio, value=step_size_neg) + else: + step_size_neg = - lr + + ratio = (exp_avg.abs() / (rho * bs * hess + 1e-15)).clamp(None,1) + param.addcmul_(exp_avg.sign(), ratio, value=step_size_neg) diff --git a/megatron/timers.py b/megatron/timers.py index 53ff9be98e..5a9ba1f21d 100644 --- a/megatron/timers.py +++ b/megatron/timers.py @@ -8,8 +8,14 @@ import torch from deepspeed.accelerator import get_accelerator +from tensorboard.summary import Writer from packaging import version +try: + import wandb +except Exception: + wandb = None + class TimerBase(ABC): @@ -33,11 +39,10 @@ def elapsed(self, reset=True, barrier=False): pass - class DummyTimer(TimerBase): def __init__(self): - super().__init__('dummy timer') + super().__init__("dummy timer") def start(self, barrier=False): return @@ -49,9 +54,7 @@ def reset(self): return def elapsed(self, reset=True, barrier=False): - raise Exception('dummy timer should not be used to ' - 'calculate elapsed time') - + raise Exception("dummy timer should not be used to " "calculate elapsed time") class Timer(TimerBase): @@ -72,37 +75,32 @@ def __init__(self, name): self._barrier_group = None self._start_time = time.time() - def set_barrier_group(self, barrier_group): self._barrier_group = barrier_group - def start(self, barrier=False): """Start the timer.""" - assert not self._started, 'timer has already been started' + assert not self._started, "timer has already been started" if barrier: torch.distributed.barrier(group=self._barrier_group) get_accelerator().synchronize() self._start_time = time.time() self._started = True - def stop(self, barrier=False): """Stop the timer.""" - assert self._started, 'timer is not started' + assert self._started, "timer is not started" if barrier: torch.distributed.barrier(group=self._barrier_group) get_accelerator().synchronize() - self._elapsed += (time.time() - self._start_time) + self._elapsed += time.time() - self._start_time self._started = False - def reset(self): """Reset timer.""" self._elapsed = 0.0 self._started = False - def elapsed(self, reset=True, barrier=False): """Calculate the elapsed time.""" _started = self._started @@ -120,7 +118,6 @@ def elapsed(self, reset=True, barrier=False): return _elapsed - class Timers: """Group of timers.""" @@ -132,24 +129,27 @@ def __init__(self, log_level, log_option): self._dummy_timer = DummyTimer() self._max_log_level = 2 - def __call__(self, name, log_level=None): # If the timer has already been set, then check if the log-level # is provided, it matches the one that the timer was created with. 
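For the `SophiaG` optimizer added in `megatron/optimizer/sophia.py` above, a hedged usage sketch (hyperparameters are illustrative; not part of the patch). Sophia needs a periodic diagonal-Hessian estimate via `update_hessian()` in addition to the usual `step()`:

```python
import torch
from megatron.optimizer.sophia import SophiaG   # the file added by this patch

model = torch.nn.Linear(16, 16)
opt = SophiaG(model.parameters(), lr=2e-4, betas=(0.965, 0.99), rho=0.05, weight_decay=0.1)

hessian_update_interval = 10
for it in range(100):
    loss = model(torch.randn(8, 16)).pow(2).mean()
    loss.backward()
    opt.step(bs=8 * 16)          # bs scales the Hessian term in the denominator
    opt.zero_grad()

    if it % hessian_update_interval == hessian_update_interval - 1:
        # Estimate the diagonal Hessian from a freshly sampled loss; this backward
        # pass only feeds update_hessian() and is not applied as a parameter step.
        model(torch.randn(8, 16)).pow(2).mean().backward()
        opt.update_hessian()
        opt.zero_grad()
```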
if name in self._timers: if log_level is not None: - assert log_level == self._log_levels[name], \ - 'input log level {} does not match already existing '\ - 'log level {} for {} timer'.format( - log_level, self._log_levels[name], name) + assert log_level == self._log_levels[name], ( + "input log level {} does not match already existing " + "log level {} for {} timer".format( + log_level, self._log_levels[name], name + ) + ) return self._timers[name] # If timer does not exist and no log level is provided, # set it to the max log level which is 2. if log_level is None: log_level = self._max_log_level - assert log_level <= self._max_log_level, \ - 'log level {} is larger than max supported log level {}'.format( - log_level, self._max_log_level) + assert ( + log_level <= self._max_log_level + ), "log level {} is larger than max supported log level {}".format( + log_level, self._max_log_level + ) # Now if the input log level is larger than the one set for # the timers class, just ignore it and return a dummy timer. if log_level > self._log_level: @@ -159,7 +159,6 @@ def __call__(self, name, log_level=None): self._log_levels[name] = log_level return self._timers[name] - def _get_elapsed_time_all_ranks(self, names, reset, barrier): """ Assumptions: @@ -185,34 +184,35 @@ def _get_elapsed_time_all_ranks(self, names, reset, barrier): # pytorch yet. It is simpler to deal with a single tensor # and since we are only gathering a small amount of data, # it should be ok to use all-gather instead of gather. - rank_name_to_time = torch.zeros((world_size, len(names)), - dtype=torch.float, - device=get_accelerator().current_device_name()) + rank_name_to_time = torch.zeros( + (world_size, len(names)), + dtype=torch.float, + device=get_accelerator().current_device_name(), + ) for i, name in enumerate(names): if name in self._timers: # Here we don't need to pass the barrier flag as all # the processes are already in sync. This avoids the # issue of different timers having different barrier # groups inside their class. - rank_name_to_time[rank, i] = self._timers[name].elapsed( - reset=reset) + rank_name_to_time[rank, i] = self._timers[name].elapsed(reset=reset) # See the note above for why we are not using gather. 
- if version.parse(torch.__version__) >= version.parse('1.13'): - torch.distributed.all_gather_into_tensor(rank_name_to_time.view(-1), - rank_name_to_time[rank, :].view(-1)) + if version.parse(torch.__version__) >= version.parse("1.13"): + torch.distributed.all_gather_into_tensor( + rank_name_to_time.view(-1), rank_name_to_time[rank, :].view(-1) + ) else: - torch.distributed._all_gather_base(rank_name_to_time.view(-1), - rank_name_to_time[rank, :].view(-1)) + torch.distributed._all_gather_base( + rank_name_to_time.view(-1), rank_name_to_time[rank, :].view(-1) + ) return rank_name_to_time - def _get_global_min_max_time(self, names, reset, barrier, normalizer): """Report only min and max times across all ranks.""" - rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, - barrier) + rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, barrier) name_to_min_max_time = {} for i, name in enumerate(names): rank_to_time = rank_name_to_time[:, i] @@ -222,34 +222,36 @@ def _get_global_min_max_time(self, names, reset, barrier, normalizer): if rank_to_time.numel() > 0: name_to_min_max_time[name] = ( rank_to_time.min().item() / normalizer, - rank_to_time.max().item() / normalizer) + rank_to_time.max().item() / normalizer, + ) return name_to_min_max_time - - def _get_global_min_max_time_string(self, names, reset, barrier, - normalizer, max_only): + def _get_global_min_max_time_string( + self, names, reset, barrier, normalizer, max_only + ): name_to_min_max_time = self._get_global_min_max_time( - names, reset, barrier, normalizer) + names, reset, barrier, normalizer + ) if not name_to_min_max_time: return None - output_string = '(min, max) time across ranks (ms):' + output_string = "(min, max) time across ranks (ms):" for name in name_to_min_max_time: min_time, max_time = name_to_min_max_time[name] if max_only: - output_string += '\n {}: {:.2f}'.format( - (name+' ').ljust(48, '.'), max_time) + output_string += "\n {}: {:.2f}".format( + (name + " ").ljust(48, "."), max_time + ) else: - output_string += '\n {}: ({:.2f}, {:.2f})'.format( - (name+' ').ljust(48, '.'), min_time, max_time) + output_string += "\n {}: ({:.2f}, {:.2f})".format( + (name + " ").ljust(48, "."), min_time, max_time + ) return output_string - def _get_all_ranks_time_string(self, names, reset, barrier, normalizer): """Report times across all ranks.""" - rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, - barrier) + rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, barrier) - output_string = 'times across ranks (ms):' + output_string = "times across ranks (ms):" no_reported_timing = True for i, name in enumerate(names): not_yet_found = True @@ -258,32 +260,32 @@ def _get_all_ranks_time_string(self, names, reset, barrier, normalizer): no_reported_timing = False if not_yet_found: not_yet_found = False - output_string += '\n {}:'.format(name) - output_string += '\n rank {:2d}: {:.2f}'.format( - rank, rank_name_to_time[rank, i] / normalizer) + output_string += "\n {}:".format(name) + output_string += "\n rank {:2d}: {:.2f}".format( + rank, rank_name_to_time[rank, i] / normalizer + ) if no_reported_timing: return None return output_string - def log(self, names, rank=None, normalizer=1.0, reset=True, barrier=False): """Log a group of timers.""" # Print. 
assert normalizer > 0.0 - if self._log_option in ['max', 'minmax']: + if self._log_option in ["max", "minmax"]: max_only = False - if self._log_option == 'max': + if self._log_option == "max": max_only = True output_string = self._get_global_min_max_time_string( - names, reset, barrier, normalizer/1000.0, max_only) - elif self._log_option == 'all': - output_string = self._get_all_ranks_time_string(names, - reset, barrier, - normalizer/1000.0) + names, reset, barrier, normalizer / 1000.0, max_only + ) + elif self._log_option == "all": + output_string = self._get_all_ranks_time_string( + names, reset, barrier, normalizer / 1000.0 + ) else: - raise Exception('unknown timing log option {}'.format( - self._log_option)) + raise Exception("unknown timing log option {}".format(self._log_option)) # If no input rank is provided, log on last rank. if rank is None: @@ -291,9 +293,15 @@ def log(self, names, rank=None, normalizer=1.0, reset=True, barrier=False): if rank == torch.distributed.get_rank() and output_string is not None: print(output_string, flush=True) - - def write(self, names, writer, iteration, normalizer=1.0, - reset=False, barrier=False): + def write( + self, + names: list[str], + writer: Writer, + iteration: int, + normalizer: float = 1.0, + reset: bool = False, + barrier: bool = False, + ): """Write timers to a tensorboard writer Note that we only report maximum time across ranks to tensorboard. """ @@ -302,8 +310,22 @@ def write(self, names, writer, iteration, normalizer=1.0, # polutes the runs list, so we just add each as a scalar assert normalizer > 0.0 name_to_min_max_time = self._get_global_min_max_time( - names, reset, barrier, normalizer) - if writer.is_enabled(): + names, reset, barrier, normalizer + ) + # <<<<<<< HEAD + timer_data = { + "timers/iteration": iteration, + **{ + f"timers/{k}-time": name_to_min_max_time[k][1] + for k in name_to_min_max_time + }, + } + if wandb is not None and getattr(wandb, "run", None) is not None: + wandb.log(timer_data, commit=False) + # ======= + # if writer.is_enabled(): + # >>>>>>> 0d6e3793a1fc06eded9764ef15ad12bcc0281101 + if writer is not None: # and writer.is_enabled(): for name in name_to_min_max_time: _, max_time = name_to_min_max_time[name] - writer.add_scalar(name + '-time', max_time, iteration) + writer.add_scalar(f"{name}-time", max_time, iteration) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 7edef90b6f..92853fb30d 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -9,6 +9,8 @@ from transformers import AutoTokenizer from .bert_tokenization import FullTokenizer as FullBertTokenizer from .gpt2_tokenization import GPT2Tokenizer + + def build_tokenizer(args): """Initialize tokenizer.""" if args.rank == 0: @@ -36,18 +38,22 @@ def build_tokenizer(args): elif args.tokenizer_type == 'GPTSentencePieceTokenizer': assert args.tokenizer_model is not None tokenizer = _GPTSentencePieceTokenizer(args.tokenizer_model) + elif args.tokenizer_type == 'Llama2Tokenizer': + assert args.tokenizer_model is not None + tokenizer = _Llama2Tokenizer(args.tokenizer_model) elif args.tokenizer_type == 'NullTokenizer': assert args.vocab_size is not None tokenizer = _NullTokenizer(args.vocab_size) elif args.tokenizer_type == 'HFTokenizer': assert args.tokenizer_model is not None + # tokenizer = _HFTokenizer(args.tokenizer_model) tokenizer = _HFTokenizer(args.tokenizer_model, args.seq_length, args.trust_remote_code) else: raise NotImplementedError('{} tokenizer is not ' 
'implemented.'.format(args.tokenizer_type)) - + # Add vocab size. args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args) @@ -507,6 +513,56 @@ def eod(self): def additional_special_tokens_ids(self): return None + +class _Llama2Tokenizer(_SentencePieceTokenizer): + """SentencePieceTokenizer-Megatron wrapper""" + + def __init__(self, model_file,): + super().__init__(model_file, vocab_extra_ids=0) + + def _initalize(self, vocab_extra_ids): + self._populate_vocab() + + # BOS / EOS token IDs + self.n_words: int = self.tokenizer.vocab_size() + self.bos_id: int = self.tokenizer.bos_id() + self.eos_id: int = self.tokenizer.eos_id() + self.pad_id: int = self.tokenizer.pad_id() + assert self.tokenizer.vocab_size() == self.tokenizer.get_piece_size() + + def tokenize(self, s: str, bos=True, eos=False): + '''Default args for text completion, not chat/dialog.''' + assert type(s) is str + t = self.tokenizer.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def detokenize(self, ids): + return self.tokenizer.decode_ids(ids) + + @property + def cls(self): + return -1 + + @property + def sep(self): + return -1 + + @property + def mask(self): + return -1 + + @property + def eod(self): + return self.eos_id + + @property + def additional_special_tokens_ids(self): + return None + class _NullTokenizer: def __init__(self, vocab_size): vocab_size = int(vocab_size) @@ -550,7 +606,6 @@ def __init__(self, tokenizer_name_or_path, max_seq_len, trust_remote_code): padding_side="right", trust_remote_code=trust_remote_code, use_fast=False) - DEFAULT_PAD_TOKEN = "[PAD]" DEFAULT_EOS_TOKEN = "" DEFAULT_BOS_TOKEN = "" diff --git a/megatron/training.py b/megatron/training.py index 0aeaabeba5..d39f21c128 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -1,101 +1,134 @@ # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - """Pretrain utilities.""" +import time + +# The earliest we can measure the start time. +_TRAIN_START_TIME = time.time() +from collections import OrderedDict from datetime import datetime +import json +import logging import math +import os import sys import time -import json -try: - import wandb -except (ImportError, ModuleNotFoundError): - wandb = None -# The earliest we can measure the start time. 
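
A hypothetical usage sketch of the `_Llama2Tokenizer` wrapper added above; the model path is a placeholder, and in practice the tokenizer is constructed through `build_tokenizer(args)` with `--tokenizer-type Llama2Tokenizer`:

```python
from megatron.tokenizer.tokenizer import _Llama2Tokenizer

tok = _Llama2Tokenizer("/path/to/tokenizer.model")    # SentencePiece model file (placeholder)
ids = tok.tokenize("Hello world")                     # BOS prepended by default (bos=True)
ids_with_eos = tok.tokenize("Hello world", eos=True)  # EOS appended as well
text = tok.detokenize(ids)                            # decodes through decode_ids()
assert tok.eod == tok.eos_id                          # EOD is mapped to the EOS id
```
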
-_TRAIN_START_TIME = time.time() + +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed.compression.compress import init_compression, redundancy_clean +from deepspeed.runtime.data_pipeline.data_routing.helper import ( + convert_to_random_ltd, +) +import ezpz as ez import torch -from collections import OrderedDict +import torch.distributed as tdist from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP -from enum import Enum - -from megatron import get_args -from megatron import get_signal_handler -from megatron import get_timers -from megatron import get_tensorboard_writer -from megatron import get_wandb_writer -from megatron import get_current_global_batch_size -from megatron import get_num_microbatches -from megatron import is_last_rank -from megatron import update_num_microbatches + +import wandb +from megatron import ( + get_args, + get_current_global_batch_size, + get_num_microbatches, + get_signal_handler, + get_tensorboard_writer, + get_timers, + is_last_rank, + update_num_microbatches, +) +from megatron.arguments import core_transformer_config_from_args +from megatron.checkpointing import load_checkpoint, save_checkpoint from megatron.core import mpu, tensor_parallel -from megatron import print_rank_0, is_rank_0 -from megatron import print_rank_last -from megatron.checkpointing import load_checkpoint -from megatron.checkpointing import save_checkpoint -from megatron.model import Float16Module -from megatron.model import GPTModel from megatron.core.enums import ModelType -from megatron.optimizer import get_megatron_optimizer -from megatron.initialize import initialize_megatron -from megatron.initialize import write_args_to_tensorboard -from megatron.initialize import set_jit_fusion_options -from megatron.optimizer_param_scheduler import OptimizerParamScheduler -from megatron.model import DistributedDataParallel as LocalDDP -from megatron.utils import check_adlr_autoresume_termination -from megatron.utils import unwrap_model, found_kill_switch -from megatron.data.data_samplers import build_pretraining_data_loader -from megatron.utils import calc_params_l2_norm from megatron.core.pipeline_parallel import get_forward_backward_func -from megatron.utils import report_memory, throughput_calculator, checkpoint_throughput_calculator, update_rotary_pos_emb +from megatron.data.data_samplers import build_pretraining_data_loader +from megatron.initialize import ( + initialize_megatron, + set_jit_fusion_options, + write_args_to_tensorboard, +) +from megatron.model import Float16Module, GPTModel +from megatron.model import DistributedDataParallel as LocalDDP +from megatron.model.transformer import ParallelTransformerLayer from megatron.model.vision.knn_monitor import compute_feature_bank -from megatron.arguments import core_transformer_config_from_args -from megatron.profiler import setup_profiler, trigger, on_step_begin, on_step_end +from megatron.optimizer import get_megatron_optimizer +from megatron.optimizer_param_scheduler import OptimizerParamScheduler +from megatron.training_log import training_log +from megatron.utils import ( + PerfTrace, + Profile, + calc_params_l2_norm, + check_adlr_autoresume_termination, + checkpoint_throughput_calculator, + found_kill_switch, + unwrap_model, + update_rotary_pos_emb, +) -import deepspeed -from deepspeed.accelerator import get_accelerator -from deepspeed.compression.compress import init_compression, redundancy_clean -from deepspeed.runtime.data_pipeline.data_routing.helper import 
convert_to_random_ltd -from megatron.model.transformer import ParallelTransformerLayer +from megatron.profiler import ( + setup_profiler, + trigger, + on_step_begin, + on_step_end, +) + + +dlp = Profile("TRAINING") + +# from deepspeed import comm as dist + +RANK: int = ez.get_rank() +WORLD_SIZE: int = ez.get_world_size() +# DEVICE_TYPE: str = ez.get_torch_device() +DEVICE_TYPE: str = ez.dist.get_torch_device_type() +DEVICE: torch.device = torch.device(DEVICE_TYPE) + +log: logging.Logger = logging.getLogger(__name__) +LOG_LEVEL: str = str(os.environ.get("LOG_LEVEL", "INFO")).upper() +log.setLevel(LOG_LEVEL) if RANK == 0 else log.setLevel("CRITICAL") -from deepspeed import comm as dist def print_datetime(string): """Note that this call will sync across all ranks.""" - torch.distributed.barrier() - time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - print_rank_0('[' + string + '] datetime: {} '.format(time_str)) + tdist.barrier() + time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + log.info("[" + string + "] datetime={} ".format(time_str)) + -''' +""" Since v0.9.0, deepspeed.initialize() has forbidden simultaneous setting of args.deepspeed_config (Path) and ds_config dict. -So, we use ds_config dict which is the more flexible option. -''' +So, we use ds_config dict which is the more flexible option +""" + + def _create_ds_config_dict(): args = get_args() - if isinstance(args.deepspeed_config, dict) : + assert args is not None + if isinstance(args.deepspeed_config, dict): ds_config_dict = args.deepspeed_config else: - with open(args.deepspeed_config, 'r', encoding='utf-8') as config_file: + with open(args.deepspeed_config, "r", encoding="utf-8") as config_file: ds_config_dict = json.load(config_file) - if args.universal_checkpoint: ds_config_dict["checkpoint"] = {"load_universal": True} - # Clear config path - args.deepspeed_config = None - + args.deepspeed_config = None return ds_config_dict - - -def pretrain(train_valid_test_dataset_provider, - model_provider, - model_type, - forward_step_func, - process_non_loss_data_func=None, - extra_args_provider=None, - args_defaults={}, - data_post_process=None, - external_args={}): + + +@dlp.log +def pretrain( + train_valid_test_dataset_provider, + model_provider, + model_type, + forward_step_func, + process_non_loss_data_func=None, + extra_args_provider=None, + args_defaults={}, + data_post_process=None, + external_args={}, +) -> list[torch.nn.Module]: """Main training program. This function will run the followings in the order provided: @@ -123,78 +156,112 @@ def pretrain(train_valid_test_dataset_provider, to it. It is used for programs to add their own arguments. args_defaults: a dictionary from argument-name to argument-value. It to set already parse arguments. - """ + Returns: + model (torch.nn.Module) + """ # Initalize and get arguments, timers, and Tensorboard writer. - initialize_megatron(extra_args_provider=extra_args_provider, - args_defaults=args_defaults, external_args=external_args) - + initialize_megatron( + extra_args_provider=extra_args_provider, + args_defaults=args_defaults, + external_args=external_args, + ) args = get_args() - + assert args is not None if found_kill_switch(): print_datetime(f"Detected kill switch at {args.kill_switch_file}. Exiting") sys.exit() # Set pytorch JIT layer fusion options and warmup JIT functions. 
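
The `_create_ds_config_dict` helper above exists because, since DeepSpeed v0.9.0, `deepspeed.initialize()` rejects receiving both a config path and a config dict, so the path is loaded into a dict and then cleared. A minimal sketch of that pattern (the function name and arguments are placeholders, not part of the patch):

```python
import json


def load_ds_config(path_or_dict, universal_checkpoint: bool = False) -> dict:
    """Return a DeepSpeed config dict, whether given a file path or an existing dict."""
    if isinstance(path_or_dict, dict):
        cfg = dict(path_or_dict)
    else:
        with open(path_or_dict, "r", encoding="utf-8") as f:
            cfg = json.load(f)
    if universal_checkpoint:
        cfg["checkpoint"] = {"load_universal": True}
    return cfg
```
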
- if get_accelerator().device_name() == 'cuda': + # if get_accelerator().device_name() == "cuda": + if DEVICE_TYPE == "cuda" and torch.cuda.is_available(): set_jit_fusion_options() # Adjust the startup time so it reflects the largest value. # This will be closer to what scheduler will see (outside of # image ... launches. + before_allreduce = time.time() global _TRAIN_START_TIME - start_time_tensor = get_accelerator().DoubleTensor([_TRAIN_START_TIME]) - torch.distributed.all_reduce(start_time_tensor, - op=torch.distributed.ReduceOp.MIN) + log.info( + f"time to finish initialize_megatron: {time.time() - _TRAIN_START_TIME} seconds" + ) + # start_time_tensor = DEVICE.DoubleTensor([_TRAIN_START_TIME]) + start_time_tensor = torch.tensor( + [_TRAIN_START_TIME], dtype=torch.double, device=DEVICE_TYPE + ) + tdist.all_reduce(start_time_tensor, op=tdist.ReduceOp.MIN) + log.info(f"allreduce call time: {time.time()-before_allreduce} seconds") _TRAIN_START_TIME = start_time_tensor.item() - print_rank_0('time to initialize megatron (seconds): {:.3f}'.format( - time.time() - _TRAIN_START_TIME)) - print_datetime('after megatron is initialized') - + log.info( + "time to initialize megatron (seconds)={:.3f}".format( + time.time() - _TRAIN_START_TIME + ) + ) + print_datetime("after megatron is initialized") + if os.getenv("DLIO_PROFILER_DATASET_DIR") is not None: + extra_trace_path = os.environ["DLIO_PROFILER_DATASET_DIR"] + else: + extra_trace_path = "" + os.makedirs(args.trace_dir, exist_ok=True) + PerfTrace.initialize_log( + f"{args.trace_dir}/trace-{ez.get_rank()}-of-{ez.get_world_size()}.pfw", + f"{args.data_cache_path}:{extra_trace_path}:{args.data_path}:{args.save}:{args.load}", + process_id=ez.get_rank(), + ) timers = get_timers() - + assert args is not None + assert timers is not None if args.deepspeed: args.deepspeed_config_dict = _create_ds_config_dict() - if "curriculum_learning" in args.deepspeed_config_dict and \ - "enabled" in args.deepspeed_config_dict["curriculum_learning"]: - args.curriculum_learning_legacy = args.deepspeed_config_dict[ \ - "curriculum_learning"]["enabled"] + if ( + "curriculum_learning" in args.deepspeed_config_dict + and "enabled" in args.deepspeed_config_dict["curriculum_learning"] + ): + args.curriculum_learning_legacy = args.deepspeed_config_dict[ + "curriculum_learning" + ]["enabled"] if args.curriculum_learning_legacy and not args.no_pipeline_parallel: - from deepspeed.runtime.data_pipeline.curriculum_scheduler \ - import CurriculumScheduler - args.curriculum_scheduler = CurriculumScheduler( \ - args.deepspeed_config_dict["curriculum_learning"]) + from deepspeed.runtime.data_pipeline.curriculum_scheduler import ( + CurriculumScheduler, + ) + + args.curriculum_scheduler = CurriculumScheduler( + args.deepspeed_config_dict["curriculum_learning"] + ) if "compression_training" in args.deepspeed_config_dict: args.compression_training = True # Model, optimizer, and learning rate. 
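
The startup-time adjustment above all-reduces `_TRAIN_START_TIME` with `ReduceOp.MIN`, so the reported initialization time is measured from the earliest rank to start, which is closer to what the scheduler sees. A standalone sketch, assuming `torch.distributed` is already initialized (`device` is a placeholder; the training code uses the accelerator device type):

```python
import time

import torch
import torch.distributed as tdist

device = "cpu"  # placeholder; requires a backend that supports this device (e.g. gloo for CPU)

train_start = time.time()
# ... expensive per-rank initialization happens here ...
t = torch.tensor([train_start], dtype=torch.double, device=device)
tdist.all_reduce(t, op=tdist.ReduceOp.MIN)  # earliest start time across all ranks
earliest_start = t.item()
print(f"time to initialize (seconds): {time.time() - earliest_start:.3f}")
```
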
- timers('model-and-optimizer-setup', log_level=0).start(barrier=True) + timers("model-and-optimizer-setup", log_level=0).start(barrier=True) model, optimizer, opt_param_scheduler = setup_model_and_optimizer( - model_provider, model_type, teacher=False, data_post_process=data_post_process, - build_train_valid_test_datasets_provider=train_valid_test_dataset_provider) - timers('model-and-optimizer-setup').stop() - print_datetime('after model, optimizer, and learning rate ' - 'scheduler are built') - + model_provider, + model_type, + teacher=False, + data_post_process=data_post_process, + build_train_valid_test_datasets_provider=train_valid_test_dataset_provider, + ) + timers("model-and-optimizer-setup").stop() + print_datetime("after model, optimizer, and learning rate " "scheduler are built") # Data stuff. - timers('train/valid/test-data-iterators-setup', log_level=0).start( - barrier=True) + timers("train/valid/test-data-iterators-setup", log_level=0).start(barrier=True) if args.virtual_pipeline_model_parallel_size is not None: all_data_iterators = [ - build_train_valid_test_data_iterators( - train_valid_test_dataset_provider) + build_train_valid_test_data_iterators(train_valid_test_dataset_provider) for _ in range(len(model)) ] - train_data_iterator = [data_iterators[0] - for data_iterators in all_data_iterators] - valid_data_iterator = [data_iterators[1] - for data_iterators in all_data_iterators] - test_data_iterator = [data_iterators[2] - for data_iterators in all_data_iterators] + train_data_iterator = [ + data_iterators[0] for data_iterators in all_data_iterators + ] + valid_data_iterator = [ + data_iterators[1] for data_iterators in all_data_iterators + ] + test_data_iterator = [ + data_iterators[2] for data_iterators in all_data_iterators + ] else: - train_data_iterator, valid_data_iterator, test_data_iterator \ - = build_train_valid_test_data_iterators( - train_valid_test_dataset_provider) + train_data_iterator, valid_data_iterator, test_data_iterator = ( + build_train_valid_test_data_iterators(train_valid_test_dataset_provider) + ) if args.data_efficiency_curriculum_learning: if args.deepspeed_dataloader is not None: # We use args to pass the deepspeed_dataloader because adding @@ -207,67 +274,79 @@ def pretrain(train_valid_test_dataset_provider, args.deepspeed_dataloader = None else: train_data_iterator = None - timers('train/valid/test-data-iterators-setup').stop() - print_datetime('after dataloaders are built') - + timers("train/valid/test-data-iterators-setup").stop() + print_datetime("after dataloaders are built") # args.teacher_model is used as global variable to pass the teacher model # for knowledge distillation. Users do not need to set it in the command # line to use kd, but users do need to provide teacher model configurations # like args.num_layers_teacher as described in setup_teacher_model() args.teacher_model = None - if args.mos or args.kd: # Set up teacher model + if args.mos or args.kd: # Set up teacher model args.teacher_model = setup_teacher_model(args, model_provider) - # Print setup timing. 
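
In the virtual-pipeline branch above, one `(train, valid, test)` iterator triple is built per model chunk and then split into per-split lists; the reshuffle is just a transpose, as in this sketch (`build_iters` and `num_chunks` are placeholders):

```python
def build_iters():
    """Placeholder for build_train_valid_test_data_iterators()."""
    return iter([]), iter([]), iter([])


num_chunks = 2  # placeholder: number of virtual-pipeline model chunks

all_iterators = [build_iters() for _ in range(num_chunks)]
train_iters = [its[0] for its in all_iterators]
valid_iters = [its[1] for its in all_iterators]
test_iters = [its[2] for its in all_iterators]
# Equivalently: train_iters, valid_iters, test_iters = map(list, zip(*all_iterators))
```
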
- print_rank_0('done with setup ...') - timers.log(['model-and-optimizer-setup', - 'train/valid/test-data-iterators-setup'], barrier=True) - + log.info("done with setup ...") + timers.log( + ["model-and-optimizer-setup", "train/valid/test-data-iterators-setup"], + barrier=True, + ) if not args.skip_train: - print_rank_0('training ...') - - if args.dataloader_type == 'cyclic' and args.retro_add_retriever: + log.info("training ...") + if args.dataloader_type == "cyclic" and args.retro_add_retriever: args.train_iters = args.retro_cyclic_train_iters - print_rank_0("retro cyclic train iters : %d" % args.train_iters) - + log.info("retro cyclic train iters : %d" % args.train_iters) iteration = 0 if args.do_train and args.train_iters > 0: - iteration = train(forward_step_func, - model, optimizer, opt_param_scheduler, - train_data_iterator, valid_data_iterator, - process_non_loss_data_func) - - print_datetime('after training is done') + iteration = train( + forward_step_func, + model, + optimizer, + opt_param_scheduler, + train_data_iterator, + valid_data_iterator, + process_non_loss_data_func, + ) + print_datetime("after training is done") # Clean the model if args.compression_training: model = [redundancy_clean(model[0], args.deepspeed_config_dict, mpu)] - if args.save and iteration != 0: save_checkpoint(iteration, model, optimizer, opt_param_scheduler) else: - print_rank_0('skipping training (--skip-train is on) ...') - + log.info("skipping training (--skip-train is on) ...") iteration = args.iteration - config = core_transformer_config_from_args(args) if args.do_valid: - prefix = f'iteration {iteration} on {args.eval_iters * args.global_batch_size}-sample draw from validation set' - evaluate_and_print_results(prefix, forward_step_func, - valid_data_iterator, model, - iteration, process_non_loss_data_func, config, - verbose=True, write_to_tensorboard=not args.skip_train) - + prefix = f"iteration {iteration} on {args.eval_iters * args.global_batch_size}-sample draw from validation set" + _ = evaluate_and_print_results( + prefix, + forward_step_func, + valid_data_iterator, + model, + iteration, + process_non_loss_data_func, + config, + verbose=True, + write_to_tensorboard=not args.skip_train, + ) if args.do_test: - prefix = f'iteration {iteration} on {args.eval_iters * args.global_batch_size}-sample draw from test set' - evaluate_and_print_results(prefix, forward_step_func, - test_data_iterator, model, - iteration, process_non_loss_data_func, config, - verbose=True, write_to_tensorboard=not args.skip_train, test=True) + prefix = f"iteration {iteration} on {args.eval_iters * args.global_batch_size}-sample draw from test set" + _ = evaluate_and_print_results( + prefix, + forward_step_func, + test_data_iterator, + model, + iteration, + process_non_loss_data_func, + config, + verbose=True, + write_to_tensorboard=not args.skip_train, + test=True, + ) return model +@dlp.log def update_train_iters(args): - # For iteration-based training, we don't need to do anything if args.train_iters: return @@ -289,16 +368,15 @@ def update_train_iters(args): update_num_microbatches(0, consistency_check=False) # Constant phase # Note that we throw away any partial last batch. 
- iterations += (args.train_samples - consumed_samples) // \ - args.global_batch_size + iterations += (args.train_samples - consumed_samples) // args.global_batch_size args.train_iters = iterations - print_rank_0('setting training iterations to {}'.format(args.train_iters)) + log.info("setting training iterations to {}".format(args.train_iters)) -def setup_teacher_model(args, model_provider): - - print_rank_0('***>>>>> Student model checkpoint iteration:{}'.format(args.iteration)) +@dlp.log +def setup_teacher_model(args, model_provider): + log.info("***>>>>> Student model checkpoint iteration:{}".format(args.iteration)) iteration_stuent = args.iteration num_layers_student = args.num_layers num_experts_student = args.num_experts @@ -306,7 +384,7 @@ def setup_teacher_model(args, model_provider): num_attention_heads_student = args.num_attention_heads load_student = args.load - print_rank_0('***>>>>> Setting up the teacher model') + log.info("***>>>>> Setting up the teacher model") args.num_layers = args.num_layers_teacher args.num_experts = args.num_experts_teacher @@ -314,7 +392,7 @@ def setup_teacher_model(args, model_provider): args.num_attention_heads = args.num_attention_heads_teacher args.load = args.load_teacher teacher_model, _, _ = load_model_weights_only(model_provider) - print_rank_0('***>>>>> Teacher model:{}'.format(teacher_model)) + log.info("***>>>>> Teacher model:{}".format(teacher_model)) args.num_layers = num_layers_student args.num_experts = num_experts_student @@ -325,16 +403,27 @@ def setup_teacher_model(args, model_provider): return teacher_model -def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True): + +@dlp.log +@ez.dist.timeitlogit(rank=RANK) +def get_model( + model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True +): """Build the model.""" args = get_args() + accelerator = get_accelerator() + assert accelerator is not None + assert args is not None args.model_type = model_type # Build model. 
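
For the constant-batch-size tail handled at the end of `update_train_iters` above, the conversion from samples to iterations is a plain integer division and any partial final batch is dropped; a worked example with illustrative numbers:

```python
train_samples = 1_000
global_batch_size = 32
consumed_samples = 0     # samples already covered by the batch-size ramp-up phase

iterations = (train_samples - consumed_samples) // global_batch_size
assert iterations == 31  # 31 * 32 = 992; the remaining 8 samples are thrown away
```
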
- if mpu.get_pipeline_model_parallel_world_size() > 1 and \ - args.virtual_pipeline_model_parallel_size is not None: - assert model_type != ModelType.encoder_and_decoder, \ - "Interleaved schedule not supported for model with both encoder and decoder" + if ( + mpu.get_pipeline_model_parallel_world_size() > 1 + and args.virtual_pipeline_model_parallel_size is not None + ): + assert ( + model_type != ModelType.encoder_and_decoder + ), "Interleaved schedule not supported for model with both encoder and decoder" model = [] for i in range(args.virtual_pipeline_model_parallel_size): mpu.set_virtual_pipeline_model_parallel_rank(i) @@ -342,8 +431,7 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap pre_process = mpu.is_pipeline_first_stage() post_process = mpu.is_pipeline_last_stage() this_model = model_provider_func( - pre_process=pre_process, - post_process=post_process + pre_process=pre_process, post_process=post_process ) this_model.model_type = model_type model.append(this_model) @@ -354,37 +442,37 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap add_decoder = True if model_type == ModelType.encoder_and_decoder: if mpu.get_pipeline_model_parallel_world_size() > 1: - assert args.pipeline_model_parallel_split_rank is not None, \ - "Split rank needs to be specified for model with both encoder and decoder" + assert ( + args.pipeline_model_parallel_split_rank is not None + ), "Split rank needs to be specified for model with both encoder and decoder" rank = mpu.get_pipeline_model_parallel_rank() split_rank = args.pipeline_model_parallel_split_rank world_size = mpu.get_pipeline_model_parallel_world_size() pre_process = rank == 0 or rank == split_rank - post_process = (rank == (split_rank - 1)) or ( - rank == (world_size - 1)) + post_process = (rank == (split_rank - 1)) or (rank == (world_size - 1)) add_encoder = mpu.is_pipeline_stage_before_split() add_decoder = mpu.is_pipeline_stage_after_split() model = model_provider_func( pre_process=pre_process, post_process=post_process, add_encoder=add_encoder, - add_decoder=add_decoder) + add_decoder=add_decoder, + ) else: model = model_provider_func( - pre_process=pre_process, - post_process=post_process + pre_process=pre_process, post_process=post_process ) model.model_type = model_type - if not isinstance(model, list): model = [model] # Disallow training and inference with Transformer Engine # for non-GPT models args.allow_transformer_engine = all([type(m) == GPTModel for m in model]) - assert args.allow_transformer_engine or args.transformer_impl == 'local', \ - 'Transformer Engine is only approved for GPT models' + assert ( + args.allow_transformer_engine or args.transformer_impl == "local" + ), "Transformer Engine is only approved for GPT models" # Set tensor model parallel attributes if not set. # Only parameters that are already tensor model parallel have these @@ -392,56 +480,84 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap # are set for all params so the optimizer can use them. for model_module in model: for param in model_module.parameters(): - tensor_parallel.set_defaults_if_not_set_tensor_model_parallel_attributes(param) + tensor_parallel.set_defaults_if_not_set_tensor_model_parallel_attributes( + param + ) # Print number of parameters. 
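
The encoder-and-decoder branch above derives `pre_process`/`post_process` from the pipeline split rank. With a hypothetical 8-stage pipeline split at rank 4, stages 0 and 4 own embeddings and stages 3 and 7 own the output side; the encoder/decoder flags below only approximate `is_pipeline_stage_before_split()`/`after_split()` for illustration:

```python
world_size, split_rank = 8, 4  # hypothetical pipeline size and encoder/decoder split rank

for rank in range(world_size):
    pre_process = rank == 0 or rank == split_rank
    post_process = rank == (split_rank - 1) or rank == (world_size - 1)
    add_encoder = rank < split_rank    # stages before the split hold encoder layers
    add_decoder = rank >= split_rank   # stages at/after the split hold decoder layers
    print(rank, pre_process, post_process, add_encoder, add_decoder)
```
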
if mpu.get_data_parallel_rank() == 0: - print(' > number of parameters on (tensor, pipeline) ' - 'model parallel rank ({}, {}): {}'.format( - mpu.get_tensor_model_parallel_rank(), - mpu.get_pipeline_model_parallel_rank(), - sum([sum([p.ds_numel if hasattr(p,'ds_id') else p.nelement() for p in model_module.parameters()]) - for model_module in model])), flush=True) + print( + " > number of parameters on (tensor, pipeline) " + "model parallel rank ({}, {})={}".format( + mpu.get_tensor_model_parallel_rank(), + mpu.get_pipeline_model_parallel_rank(), + sum( + [ + sum( + [ + p.ds_numel if hasattr(p, "ds_id") else p.nelement() + for p in model_module.parameters() + ] + ) + for model_module in model + ] + ), + ), + flush=True, + ) if args.deepspeed: return model # GPU allocation. for model_module in model: - model_module.to(get_accelerator().current_device_name()) - + model_module.to(DEVICE_TYPE) # Fp16 conversion. if args.fp16 or args.bf16: model = [Float16Module(model_module, args) for model_module in model] if wrap_with_ddp: - if args.DDP_impl == 'torch': - i = get_accelerator().current_device() - model = [torchDDP(model_module, device_ids=[i], output_device=i, - process_group=mpu.get_data_parallel_group()) - for model_module in model] - - elif args.DDP_impl == 'local': - model = [LocalDDP(model_module, - args.accumulate_allreduce_grads_in_fp32, - args.use_contiguous_buffers_in_local_ddp) - for model_module in model] + if args.DDP_impl == "torch": + i = accelerator.current_device() + model = [ + torchDDP( + model_module, + device_ids=[i], + output_device=i, + process_group=mpu.get_data_parallel_group(), + ) + for model_module in model + ] + + elif args.DDP_impl == "local": + model = [ + LocalDDP( + model_module, + args.accumulate_allreduce_grads_in_fp32, + args.use_contiguous_buffers_in_local_ddp, + ) + for model_module in model + ] # broad cast params from data parallel src rank to other data parallel ranks if args.data_parallel_random_init: for model_module in model: model_module.broadcast_params() else: - raise NotImplementedError('Unknown DDP implementation specified: ' - '{}. Exiting.'.format(args.DDP_impl)) + raise NotImplementedError( + "Unknown DDP implementation specified: " + "{}. Exiting.".format(args.DDP_impl) + ) return model +@dlp.log +@ez.dist.timeitlogit(rank=RANK) def get_optimizer_param_scheduler(optimizer): """Build the learning rate scheduler.""" args = get_args() - + assert args is not None # Iteration-based training. 
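
The parameter count printed in `get_model` above prefers `ds_numel`, so ZeRO-partitioned parameters report their full (unpartitioned) size; a compact sketch of the same reduction over a list of model chunks:

```python
def count_params(model_chunks) -> int:
    """Sum parameter counts across model chunks, honoring ZeRO-partitioned params."""
    return sum(
        p.ds_numel if hasattr(p, "ds_id") else p.nelement()
        for chunk in model_chunks
        for p in chunk.parameters()
    )
```
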
if args.train_iters: if args.lr_decay_iters is None: @@ -467,8 +583,7 @@ def get_optimizer_param_scheduler(optimizer): else: lr_warmup_steps = args.lr_warmup_samples else: - raise Exception( - 'either train-iters or train-samples should be provided.') + raise Exception("either train-iters or train-samples should be provided.") opt_param_scheduler = OptimizerParamScheduler( optimizer, @@ -482,74 +597,75 @@ def get_optimizer_param_scheduler(optimizer): wd_incr_steps=wd_incr_steps, wd_incr_style=args.weight_decay_incr_style, use_checkpoint_opt_param_scheduler=args.use_checkpoint_opt_param_scheduler, - override_opt_param_scheduler=args.override_opt_param_scheduler) + override_opt_param_scheduler=args.override_opt_param_scheduler, + ) return opt_param_scheduler + +@dlp.log def load_model_weights_only(model_provider_func): """Setup model and optimizer.""" args = get_args() - print_rank_0('***>>>>> Args:{}'.format(args)) - + assert args is not None + log.info("***>>>>> Args:{}".format(args)) model = get_model(model_provider_func) - optimizer = None lr_scheduler = None - if args.deepspeed: # When loading just the model weights, ZeRO can be disabled. - if 'zero_optimization' in args.deepspeed_config_dict: - del args.deepspeed_config_dict['zero_optimization'] + if "zero_optimization" in args.deepspeed_config_dict: + del args.deepspeed_config_dict["zero_optimization"] model, optimizer, _, lr_scheduler = deepspeed.initialize( - model=model[0], - config=args.deepspeed_config_dict + model=model[0], config=args.deepspeed_config_dict ) - assert not isinstance(model, deepspeed.PipelineEngine), \ - 'Weight loading only mode is not supported in pipeline parallelism yet.' - + assert not isinstance(model, deepspeed.PipelineEngine), ( + "Weight loading only mode is not supported in " "pipeline parallelism yet." 
+ ) model = [model] - - print_datetime('before load checkpoint') + print_datetime("before load checkpoint") if args.load is not None: - iteration = load_checkpoint(model, optimizer, lr_scheduler, strict=True, load_only_weights=True) - - print_datetime('after load checkpoint weights') - + _ = load_checkpoint( + model, optimizer, lr_scheduler, strict=True, load_only_weights=True + ) + print_datetime("after load checkpoint weights") return model, optimizer, lr_scheduler -def setup_model_and_optimizer(model_provider_func, - model_type, - no_wd_decay_cond=None, - scale_lr_cond=None, - lr_mult=1.0, - teacher=False, - data_post_process=None, - build_train_valid_test_datasets_provider=None): +@dlp.log +@ez.dist.timeitlogit(rank=RANK) +def setup_model_and_optimizer( + model_provider_func, + model_type, + no_wd_decay_cond=None, + scale_lr_cond=None, + lr_mult=1.0, + teacher=False, + data_post_process=None, + build_train_valid_test_datasets_provider=None, +): """Setup model and optimizer.""" args = get_args() - + assert args is not None model = get_model(model_provider_func, model_type) - # initialize the compression here student_global_steps = 0 if args.kd or args.mos: model, _, _, _ = deepspeed.initialize( - model=model[0], - args=args, - mpu=mpu if args.no_pipeline_parallel else None, - config=args.deepspeed_config_dict, - ) + model=model[0], + args=args, + mpu=mpu if args.no_pipeline_parallel else None, + config=args.deepspeed_config_dict, + ) model = [model] if args.load is not None: args.iteration = load_checkpoint(model, None, None, strict=False) else: args.iteration = 0 student_global_steps = model[0].global_steps - print_rank_0('***>>>>> Student model, global step:{}'.format(student_global_steps)) - + log.info("***>>>>> Student model, global step:{}".format(student_global_steps)) if args.compression_training: model, _, _, _ = deepspeed.initialize( model=model[0], @@ -559,10 +675,7 @@ def setup_model_and_optimizer(model_provider_func, ) model = [model] model = [init_compression(model[0].module, args.deepspeed_config_dict, mpu)] - - unwrapped_model = unwrap_model(model, - (torchDDP, LocalDDP, Float16Module)) - + unwrapped_model = unwrap_model(model, (torchDDP, LocalDDP, Float16Module)) if args.inference: optimizer = None opt_param_scheduler = None @@ -570,113 +683,156 @@ def setup_model_and_optimizer(model_provider_func, if teacher: optimizer = None else: - optimizer = get_megatron_optimizer(model, no_wd_decay_cond, - scale_lr_cond, lr_mult) + optimizer = get_megatron_optimizer( + model, no_wd_decay_cond, scale_lr_cond, lr_mult + ) # opt_param_scheduler is the old lr_scheduler plus weight decay scheduling opt_param_scheduler = get_optimizer_param_scheduler(optimizer) - if args.deepspeed: - print_rank_0("DeepSpeed is enabled.") - pp = mpu.get_pipeline_model_parallel_world_size() - if args.data_efficiency_curriculum_learning and build_train_valid_test_datasets_provider is not None: + log.info("DeepSpeed is enabled.") + # pp = mpu.get_pipeline_model_parallel_world_size() + if ( + args.data_efficiency_curriculum_learning + and build_train_valid_test_datasets_provider is not None + ): + log.info( + "Caught 'args.data_efficiency_curriculum_learning' " + "and 'build_train_valid_test_datasets_provider is not None'" + ) train_ds = None # Only need to build dataset on tp rank 0 since Megatron has the # broadcast_data() function that broadcast data from tp rank 0. 
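
In `load_model_weights_only` above, ZeRO is switched off by dropping the `zero_optimization` section before calling `deepspeed.initialize()`, since only the weights are needed. A minimal sketch of that pattern (`ds_config` and `model_chunks` are placeholders for the values built earlier in the function):

```python
import deepspeed

ds_config = {}        # placeholder: dict from _create_ds_config_dict()
model_chunks = [...]  # placeholder: list of model chunks from get_model()

ds_config.pop("zero_optimization", None)  # ZeRO can be disabled for weight-only loads
engine, _, _, lr_scheduler = deepspeed.initialize(model=model_chunks[0], config=ds_config)
model_chunks = [engine]                   # keep the list-of-chunks convention
```
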
if mpu.get_tensor_model_parallel_rank() == 0: + log.info("Caught 'mpu.get_tensor_model_parallel_rank() == 0'") # Number of train/valid/test samples. if args.train_samples: train_samples = args.train_samples update_train_iters(args) else: train_samples = args.train_iters * args.global_batch_size + log.info(f"{train_samples=}") # eval_iters and test_iters here are not actually used, only for # satisfying the input of build_train_valid_test_datasets_provider. # We only need to build the training data here. And we follow # baseline's logic to build eval/test dataset later in # build_train_valid_test_data_iterators. - eval_iters = (args.train_iters // args.eval_interval + 1) * \ - args.eval_iters + eval_iters = ( + args.train_iters // args.eval_interval + 1 + ) * args.eval_iters test_iters = args.eval_iters - train_val_test_num_samples = [train_samples, - eval_iters * args.global_batch_size, - test_iters * args.global_batch_size] + train_val_test_num_samples = [ + train_samples, + eval_iters * args.global_batch_size, + test_iters * args.global_batch_size, + ] + log.info(f"{train_val_test_num_samples=}") # Build the datasets. train_ds, _, _ = build_train_valid_test_datasets_provider( - train_val_test_num_samples) - model, optimizer, args.deepspeed_dataloader, opt_param_scheduler = deepspeed.initialize( - model=model[0], - optimizer=optimizer, - args=args, - lr_scheduler=opt_param_scheduler, - training_data=train_ds, - mpu=mpu if args.no_pipeline_parallel else None, - config=args.deepspeed_config_dict, - ) + train_val_test_num_samples + ) + with Profile("deepspeed.initialize"): + model, optimizer, args.deepspeed_dataloader, opt_param_scheduler = ( + deepspeed.initialize( + model=model[0], + optimizer=optimizer, + args=args, + lr_scheduler=opt_param_scheduler, + training_data=train_ds, + mpu=mpu if args.no_pipeline_parallel else None, + config=args.deepspeed_config_dict, + ) + ) model.set_data_post_process_func(data_post_process) else: - model, optimizer, _, opt_param_scheduler = deepspeed.initialize( - model=model[0], - optimizer=optimizer, - args=args, - lr_scheduler=opt_param_scheduler, - mpu=mpu if args.no_pipeline_parallel else None, - config=args.deepspeed_config_dict, + log.info( + "Did NOT catch: ('args.data_efficiency_curriculum_learning' " + "and 'build_train_valid_test_datasets_provider is not None')" ) + tds0 = time.time() + if os.environ.get("PYINSTRUMENT_PROFILER", None): + profiler = ez.profile.get_context_manager(rank=RANK, outdir=args.save) + else: + profiler = Profile("deepspeed.initialize") + log.info("Calling 'deepspeed.initialize'...") + log.info(f"Wrapped with: {profiler=}") + with profiler: + model, optimizer, _, opt_param_scheduler = deepspeed.initialize( + model=model[0], + optimizer=optimizer, + args=args, + lr_scheduler=opt_param_scheduler, + mpu=mpu if args.no_pipeline_parallel else None, + config=args.deepspeed_config_dict, + ) + log.info(f"'deepspeed.initialize' took: {time.time() - tds0:.5f}s") if isinstance(model, deepspeed.PipelineEngine): # hack to get batch_fn from pretrain_gpt.py model.set_batch_fn(model.module._megatron_batch_fn) - - assert model.grid.get_pipe_parallel_rank() == mpu.get_pipeline_model_parallel_rank() - assert model.grid.get_slice_parallel_rank() == mpu.get_tensor_model_parallel_rank() + assert ( + model.grid.get_pipe_parallel_rank() + == mpu.get_pipeline_model_parallel_rank() + ) + assert ( + model.grid.get_slice_parallel_rank() + == mpu.get_tensor_model_parallel_rank() + ) assert model.grid.get_data_parallel_rank() == 
mpu.get_data_parallel_rank() model = [model] - - # Compression has its own checkpoint loading path (e.g, loading both teacher and student models). So if compression is enabled, we skip the following checkpoint loading. + # Compression has its own checkpoint loading path (e.g, loading both teacher + # and student models). So if compression is enabled, we skip the following + # checkpoint loading. no_post_init_checkpoint_loading = args.kd or args.mos if not no_post_init_checkpoint_loading: if args.load is not None: timers = get_timers() - timers('load-checkpoint', log_level=0).start(barrier=True) + assert timers is not None + timers("load-checkpoint", log_level=0).start(barrier=True) args.iteration = load_checkpoint(model, optimizer, opt_param_scheduler) - timers('load-checkpoint').stop(barrier=True) - timers.log(['load-checkpoint']) + timers("load-checkpoint").stop(barrier=True) + timers.log(["load-checkpoint"]) else: args.iteration = 0 else: model[0].global_steps = student_global_steps - # We only support local DDP with multiple micro-batches. if len(model) > 1 or mpu.get_pipeline_model_parallel_world_size() > 1: - assert args.DDP_impl == 'local' - + assert args.DDP_impl == "local" # get model without FP16 and/or TorchDDP wrappers - if args.iteration == 0 and len(unwrapped_model) == 1 \ - and hasattr(unwrapped_model[0], 'init_state_dict_from_bert'): - print_rank_0("Initializing ICT from pretrained BERT model") + if ( + args.iteration == 0 + and len(unwrapped_model) == 1 + and hasattr(unwrapped_model[0], "init_state_dict_from_bert") + ): + log.info("Initializing ICT from pretrained BERT model") unwrapped_model[0].init_state_dict_from_bert() if args.fp16: + assert optimizer is not None optimizer.reload_model_params() - # random-LTD requires converting transformer layers if args.random_ltd: model[0] = convert_to_random_ltd(model[0], ParallelTransformerLayer) - return model, optimizer, opt_param_scheduler - -def train_step(forward_step_func, data_iterator, - model, optimizer, opt_param_scheduler, config): +@dlp.log +def train_step( + forward_step_func, data_iterator, model, optimizer, opt_param_scheduler, config +): """Single training step.""" args = get_args() timers = get_timers() - + accelerator = get_accelerator() + assert args is not None and timers is not None and accelerator is not None + grad_norm = None + num_zeros_in_grad = None if args.deepspeed and args.ds_pipeline_enabled: num_zeros_in_grad = 0 assert isinstance(model[0], deepspeed.PipelineEngine) loss = model[0].train_batch(data_iter=data_iterator) additional_losses = model[0].get_additional_losses() - loss_key = 'lm loss' if additional_losses is None else 'loss' # use "lm loss" for backward compatibility + loss_key = ( + "lm loss" if additional_losses is None else "loss" + ) # use "lm loss" for backward compatibility loss_dict = OrderedDict({loss_key: loss}) if additional_losses is not None: loss_dict.update(additional_losses) @@ -687,14 +843,13 @@ def train_step(forward_step_func, data_iterator, # Set grad to zero. if not args.deepspeed: - if args.DDP_impl == 'local' and args.use_contiguous_buffers_in_local_ddp: + if args.DDP_impl == "local" and args.use_contiguous_buffers_in_local_ddp: for partition in model: partition.zero_grad_buffer() optimizer.zero_grad() # Forward pass. 
- timers('forward-backward', log_level=1).start( - barrier=args.barrier_with_L1_time) + timers("forward-backward", log_level=1).start(barrier=args.barrier_with_L1_time) forward_backward_func = get_forward_backward_func() if args.mos or args.kd: # args.teacher_forward is used as global variable to enable kd loss @@ -706,26 +861,29 @@ def train_step(forward_step_func, data_iterator, if args.timing_log_level < 2: config.timers = None + num_microbatches = get_num_microbatches() + assert num_microbatches is not None losses_reduced = forward_backward_func( forward_step_func=forward_step_func, data_iterator=data_iterator, model=model, - num_microbatches=get_num_microbatches(), + num_microbatches=num_microbatches, seq_length=args.seq_length, micro_batch_size=args.micro_batch_size, decoder_seq_length=args.decoder_seq_length, - forward_only=False) + forward_only=False, + ) # reset timers if necessary if config.timers is None: config.timers = timers - timers('forward-backward').stop() + timers("forward-backward").stop() if args.mos or args.kd: args.teacher_forward = False # Empty unused memory. - if args.empty_unused_memory_level >= 1: - get_accelerator().empty_cache() + if args.empty_unused_memory_level >= 1 and accelerator is not None: + accelerator.empty_cache() # Reduce gradients. if not args.deepspeed: @@ -733,21 +891,23 @@ def train_step(forward_step_func, data_iterator, # Vision gradients. if args.vision_pretraining and args.vision_pretraining_type == "dino": - unwrapped_model = unwrap_model(model[0], - (torchDDP, LocalDDP, Float16Module)) + unwrapped_model = unwrap_model(model[0], (torchDDP, LocalDDP, Float16Module)) unwrapped_model.cancel_gradients_last_layer(args.curr_iteration) # Update parameters. - timers('optimizer', log_level=1).start(barrier=args.barrier_with_L1_time) + timers("optimizer", log_level=1).start(barrier=args.barrier_with_L1_time) if args.deepspeed: - increment = get_num_microbatches() * \ - args.micro_batch_size * \ - args.data_parallel_size - model[0].step(lr_kwargs={'increment': increment}) - update_successful = model[0].was_step_applied() + increment = ( + get_num_microbatches() * args.micro_batch_size * args.data_parallel_size + ) + try: + model[0].step(lr_kwargs={"increment": increment}) + update_successful = model[0].was_step_applied() + except Exception: + update_successful = False else: update_successful, grad_norm, num_zeros_in_grad = optimizer.step(args, timers) - timers('optimizer').stop() + timers("optimizer").stop() # Gather params. if not args.deepspeed and update_successful: @@ -755,551 +915,258 @@ def train_step(forward_step_func, data_iterator, # Vision momentum. if args.vision_pretraining and args.vision_pretraining_type == "dino": - unwrapped_model = unwrap_model(model[0], - (torchDDP, LocalDDP, Float16Module)) + unwrapped_model = unwrap_model(model[0], (torchDDP, LocalDDP, Float16Module)) unwrapped_model.update_momentum(args.curr_iteration) # Update learning rate. if args.deepspeed: skipped_iter = 0 if update_successful else 1 - grad_norm = None + grad_norm = model[0].get_global_grad_norm() + # Empty unused memory. + if args.empty_unused_memory_level >= 2 and accelerator is not None: + accelerator.empty_cache() + # XXX: [saforem2]: ---------------------------------------------------- + # Is `num_zeros_in_grad` worth calculating (/ implementing) ?? 
+ # the `Megatron`-specific implementation is at: + # [megatron.optimizer.clip_grads.count_zeros_fp32](./optimizer/clip_grads.py) + # For now, explicitly set to None + # --------------------------------------------------------------------- num_zeros_in_grad = None - loss_reduced = {} for key in losses_reduced[0]: losses_reduced_for_key = [x[key] for x in losses_reduced] - loss_reduced[key] = sum(losses_reduced_for_key) / len(losses_reduced_for_key) + loss_reduced[key] = sum(losses_reduced_for_key) / len( + losses_reduced_for_key + ) return loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad + if update_successful: + increment = ( + get_num_microbatches() * args.micro_batch_size * args.data_parallel_size + ) + opt_param_scheduler.step(increment=increment) + skipped_iter = 0 else: - if update_successful: - increment = get_num_microbatches() * \ - args.micro_batch_size * \ - args.data_parallel_size - opt_param_scheduler.step(increment=increment) - skipped_iter = 0 - else: - skipped_iter = 1 - - # Empty unused memory. - if args.empty_unused_memory_level >= 2: - get_accelerator().empty_cache() - - if mpu.is_pipeline_last_stage(ignore_virtual=True): - # Average loss across microbatches. - loss_reduced = {} - for key in losses_reduced[0]: - losses_reduced_for_key = [x[key] for x in losses_reduced] - loss_reduced[key] = sum(losses_reduced_for_key) / len(losses_reduced_for_key) - return loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad - return {}, skipped_iter, grad_norm, num_zeros_in_grad - - -class InteropLoggingTool(Enum): - TENSORBOARD = 1 - WANDB = 2 - - -class interop_tool_logger: - def __init__(self, tb_writer=None, wandb_writer=None): - self.tb_writer = tb_writer - self.wandb_writer = wandb_writer - self.custom_x_axis = [] - self.custom_y_axis = {} - self.args = get_args() - if not hasattr(self.args, "logger_iteration"): - self.args.logger_iteration = 1 - - def is_enabled(self): - return self.tb_writer or self.wandb_writer - - def add_scalar(self, key, scalar_value, step, custom_step_name=None, \ - tool_list=[InteropLoggingTool.TENSORBOARD, InteropLoggingTool.WANDB]): - if self.tb_writer and \ - InteropLoggingTool.TENSORBOARD in tool_list: - self.tb_writer.add_scalar(key, scalar_value, step) - - if self.wandb_writer and \ - InteropLoggingTool.WANDB in tool_list: - if not custom_step_name: - self.wandb_writer.log({key: scalar_value}, step=step) - if self.args.logger_iteration < step: - # Updating iteration - self.args.logger_iteration = step - - else: - if custom_step_name not in self.custom_x_axis: - self.custom_x_axis.append(custom_step_name) - wandb.define_metric(custom_step_name) - - if key not in self.custom_y_axis: - self.custom_y_axis[key] = custom_step_name - wandb.define_metric(key, step_metric=custom_step_name) - - self.wandb_writer.log({key: scalar_value, custom_step_name: step}, \ - step=self.args.logger_iteration) - - - def add_scalar_to_tb(self, key, scalar_value, step): - return self.add_scalar(key, scalar_value, step, None, [InteropLoggingTool.TENSORBOARD]) - - def add_scalar_to_wandb(self, key, scalar_value, step, custom_step_name=None): - return self.add_scalar(key, scalar_value, step, custom_step_name, [InteropLoggingTool.WANDB]) - - def add_images(self, key, img_tensor, step=None): - if self.tb_writer: - self.tb_writer.add_images(key, img_tensor, step) - - if self.wandb_writer: - self.wandb_writer.log({key: wandb.Image(img_tensor)}, step) + skipped_iter = 1 + # Empty unused memory. 
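
The `increment` passed to the learning-rate scheduler in `train_step` above is the number of samples consumed per optimizer step (micro-batches times micro-batch size times data-parallel replicas), i.e. the global batch size; with illustrative numbers:

```python
num_microbatches = 8     # gradient-accumulation steps
micro_batch_size = 4     # samples per micro-batch per data-parallel replica
data_parallel_size = 16  # data-parallel replicas

increment = num_microbatches * micro_batch_size * data_parallel_size
assert increment == 512  # samples consumed per optimizer step (the global batch size)
```
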
+ if args.empty_unused_memory_level >= 2 and accelerator is not None: + accelerator.empty_cache() -def training_log(loss_dict, total_loss_dict, learning_rate, iteration, - loss_scale, report_memory_flag, skipped_iter, - grad_norm, params_norm, num_zeros_in_grad, - model=None, optimizer=None): - """Log training information such as losses, timing, ....""" - args = get_args() - timers = get_timers() - writer = interop_tool_logger(tb_writer=get_tensorboard_writer(), \ - wandb_writer=get_wandb_writer()) - x_axis_samples = 'Samples' - x_axis_tokens = 'Tokens' - # Advanced, skipped, and Nan iterations. - advanced_iters_key = 'advanced iterations' - skipped_iters_key = 'skipped iterations' - nan_iters_key = 'nan iterations' - # Advanced iterations. - if not skipped_iter: - total_loss_dict[advanced_iters_key] = total_loss_dict.get( - advanced_iters_key, 0) + 1 - else: - if advanced_iters_key not in total_loss_dict: - total_loss_dict[advanced_iters_key] = 0 - # Skipped iterations. - total_loss_dict[skipped_iters_key] = total_loss_dict.get( - skipped_iters_key, 0) + skipped_iter - # Update losses and set nan iterations - got_nan = False - for key in loss_dict: - if not skipped_iter: - total_loss_dict[key] = total_loss_dict.get( - key, get_accelerator().FloatTensor([0.0])) + loss_dict[key] - else: - value = loss_dict[key].float().sum().item() - is_nan = value == float('inf') or \ - value == -float('inf') or \ - value != value - got_nan = got_nan or is_nan - total_loss_dict[nan_iters_key] = total_loss_dict.get( - nan_iters_key, 0) + int(got_nan) - - # Logging. - timers_to_log = [ - 'forward-backward', - 'forward-compute', - 'backward-compute', - 'batch-generator', - 'forward-recv', - 'forward-send', - 'backward-recv', - 'backward-send', - 'forward-send-forward-recv', - 'forward-send-backward-recv', - 'backward-send-forward-recv', - 'backward-send-backward-recv', - 'forward-backward-send-forward-backward-recv', - 'layernorm-grads-all-reduce', - 'embedding-grads-all-reduce', - 'grads-all-reduce', - 'grads-reduce-scatter', - 'params-all-gather', - 'optimizer-copy-to-main-grad', - 'optimizer-unscale-and-check-inf', - 'optimizer-clip-main-grad', - 'optimizer-count-zeros', - 'optimizer-inner-step', - 'optimizer-copy-main-to-model-params', - 'optimizer'] - - # Calculate batch size. - batch_size = args.micro_batch_size * args.data_parallel_size * \ - get_num_microbatches() - - total_iterations = total_loss_dict[advanced_iters_key] + \ - total_loss_dict[skipped_iters_key] - - # Tensorboard values. - # Timer requires all the ranks to call. 
- if args.log_timers_to_tensorboard and \ - (iteration % args.tensorboard_log_interval == 0): - timers.write(timers_to_log, writer, iteration, - normalizer=total_iterations) - if writer.is_enabled() and (iteration % args.tensorboard_log_interval == 0): - writer.add_scalar('steps-vs-samples/y=steps,x=samples', iteration, args.consumed_train_samples, x_axis_samples) - writer.add_scalar('steps-vs-samples/y=samples,x=steps', args.consumed_train_samples, iteration) - writer.add_scalar('steps-vs-tokens/y=steps,x=tokens', iteration, args.consumed_train_tokens, x_axis_tokens) - writer.add_scalar('steps-vs-tokens/y=tokens,x=steps', args.consumed_train_tokens, iteration) - if args.log_learning_rate_to_tensorboard: - writer.add_scalar('learning-rate/learning-rate', learning_rate, iteration) - writer.add_scalar('learning-rate/learning-rate vs samples', learning_rate, - args.consumed_train_samples, x_axis_samples) - writer.add_scalar('learning-rate/learning-rate vs tokens', learning_rate, - args.consumed_train_tokens, x_axis_tokens) - if args.log_batch_size_to_tensorboard: - writer.add_scalar('batch-size/batch-size', batch_size, iteration) - writer.add_scalar('batch-size/batch-size vs samples', batch_size, - args.consumed_train_samples, x_axis_samples) - writer.add_scalar('batch-size/batch-size vs tokens', batch_size, - args.consumed_train_tokens, x_axis_tokens) - for key in loss_dict: - writer.add_scalar(f"lm-loss-training/{key}", loss_dict[key], iteration) - writer.add_scalar(f"lm-loss-training/{key}" + ' vs samples', loss_dict[key], - args.consumed_train_samples, x_axis_samples) - writer.add_scalar(f"lm-loss-training/{key}" + ' vs tokens', loss_dict[key], - args.consumed_train_tokens, x_axis_tokens) - if args.fp16 and loss_scale and args.log_loss_scale_to_tensorboard: - writer.add_scalar('loss-scale/loss-scale', loss_scale, iteration) - writer.add_scalar('loss-scale/loss-scale vs samples', loss_scale, - args.consumed_train_samples, x_axis_samples) - writer.add_scalar('loss-scale/loss-scale vs tokens', loss_scale, - args.consumed_train_tokens, x_axis_tokens) - if args.log_world_size_to_tensorboard: - writer.add_scalar('world-size/world-size', args.world_size, iteration) - writer.add_scalar('world-size/world-size vs samples', args.world_size, - args.consumed_train_samples, x_axis_samples) - writer.add_scalar('world-size/world-size vs tokens', args.world_size, - args.consumed_train_tokens, x_axis_tokens) - if grad_norm is not None: - writer.add_scalar('grad-norm/grad-norm', grad_norm, iteration) - writer.add_scalar('grad-norm/grad-norm vs samples', grad_norm, - args.consumed_train_samples, x_axis_samples) - writer.add_scalar('grad-norm/grad-norm vs tokens', grad_norm, - args.consumed_train_tokens, x_axis_tokens) - if num_zeros_in_grad is not None: - writer.add_scalar('num-zeros/num-zeros', num_zeros_in_grad, iteration) - writer.add_scalar('num-zeros/num-zeros vs samples', num_zeros_in_grad, - args.consumed_train_samples, x_axis_samples) - writer.add_scalar('num-zeros/num-zeros vs tokens', num_zeros_in_grad, - args.consumed_train_tokens, x_axis_tokens) - if params_norm is not None: - writer.add_scalar('params-norm/params-norm', params_norm, iteration) - writer.add_scalar('params-norm/params-norm vs samples', params_norm, - args.consumed_train_samples, x_axis_samples) - writer.add_scalar('params-norm/params-norm vs tokens', params_norm, - args.consumed_train_tokens, x_axis_tokens) - if hasattr(args, 'actual_seq_length'): - writer.add_scalar('seqlen/actual_seq_length', args.actual_seq_length, - iteration) 
- writer.add_scalar('seqlen/actual_seq_length vs samples', args.actual_seq_length, - args.consumed_train_samples, x_axis_samples) - writer.add_scalar('seqlen/actual_seq_length vs tokens', args.actual_seq_length, - args.consumed_train_tokens, x_axis_tokens) - if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: - writer.add_scalar('seqlen/curriculum_seqlen', args.curriculum_seqlen, - iteration) - writer.add_scalar('seqlen/curriculum_seqlen vs samples', args.curriculum_seqlen, - args.consumed_train_samples, x_axis_samples) - writer.add_scalar('seqlen/curriculum_seqlen vs tokens', args.curriculum_seqlen, - args.consumed_train_tokens, x_axis_tokens) - if args.random_ltd: - writer.add_scalar('seqlen/random_ltd_reserved_length', args.random_ltd_reserved_length, - iteration) - writer.add_scalar('seqlen/random_ltd_reserved_length vs samples', args.random_ltd_reserved_length, - args.consumed_train_samples, x_axis_samples) - writer.add_scalar('seqlen/random_ltd_reserved_length vs tokens', args.random_ltd_reserved_length, - args.consumed_train_tokens, x_axis_tokens) - if args.log_memory_to_tensorboard: - mem_stats = torch.cuda.memory_stats() - writer.add_scalar( - "mem-reserved-bytes", - mem_stats["reserved_bytes.all.current"], - iteration, - ) - writer.add_scalar( - "mem-allocated-bytes", - mem_stats["allocated_bytes.all.current"], - iteration, - ) - writer.add_scalar( - "mem-allocated-count", - mem_stats["allocation.all.current"], - iteration, + if mpu.is_pipeline_last_stage(ignore_virtual=True): + # Average loss across microbatches. + loss_reduced = {} + for key in losses_reduced[0]: + losses_reduced_for_key = [x[key] for x in losses_reduced] + loss_reduced[key] = sum(losses_reduced_for_key) / len( + losses_reduced_for_key ) - - if iteration % args.tensorboard_log_interval == 0: - # This logging write various optimizer states to tensorboard. This - # feature may consume extra GPU memory thus is set at false by default. - if args.log_optimizer_states_to_tensorboard and optimizer is not None: - opt_stats = [0.0] * 8 - opt_stats_2 = [0.0] * 4 - - #TODO(billishyahao): Remove me after bf16_optimizer promotes its state. - if not hasattr(optimizer, "state"): - assert hasattr(optimizer, "optimizer"), f"Optimizer must have optimizer property." 
- optimizer.state = optimizer.optimizer.state - - for _, group in enumerate(optimizer.param_groups): - for _, param in enumerate(group['params']): - opt_stats[0] += (torch.norm(optimizer.state[param]['exp_avg_sq']).item())**2 - opt_stats[1] += (torch.norm(optimizer.state[param]['exp_avg_sq'].sqrt()).item())**2 - opt_stats[2] += (torch.norm(optimizer.state[param]['exp_avg']).item())**2 - opt_stats[3] += (torch.norm(param).item())**2 - opt_stats[4] += torch.norm(optimizer.state[param]['exp_avg_sq'],p=1).item() - opt_stats[5] += torch.norm(optimizer.state[param]['exp_avg_sq'].sqrt(),p=1).item() - opt_stats[6] += torch.norm(optimizer.state[param]['exp_avg'],p=1).item() - opt_stats[7] += torch.norm(param,p=1).item() - opt_stats_2[0] = max(opt_stats_2[0], abs(optimizer.state[param]['exp_avg_sq'].max().item()), abs(optimizer.state[param]['exp_avg_sq'].min().item())) - opt_stats_2[1] = max(opt_stats_2[1], optimizer.state[param]['exp_avg_sq'].sqrt().abs_().max().item()) - opt_stats_2[2] = max(opt_stats_2[2], abs(optimizer.state[param]['exp_avg'].max().item()), abs(optimizer.state[param]['exp_avg'].min().item())) - opt_stats_2[3] = max(opt_stats_2[3], abs(param.max().item()), abs(param.min().item())) - # print('step {} rank {} before sync opt_stats {}, {}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) - if args.zero_stage > 0: - # ZeRO partiions optimizer states - opt_stats = get_accelerator().FloatTensor(opt_stats) - torch.distributed.all_reduce(opt_stats, group=mpu.get_sequence_data_parallel_group()) - opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) - torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, - group=mpu.get_sequence_data_parallel_group()) - - if args.tensor_model_parallel_size > 1: - opt_stats = get_accelerator().FloatTensor(opt_stats) - torch.distributed.all_reduce(opt_stats, group=mpu.get_tensor_model_parallel_group()) - opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) - torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, - group=mpu.get_tensor_model_parallel_group()) - - if args.pipeline_model_parallel_size > 1: - opt_stats = get_accelerator().FloatTensor(opt_stats) - torch.distributed.all_reduce(opt_stats, group=mpu.get_pipeline_model_parallel_group()) - opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) - torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, - group=mpu.get_pipeline_model_parallel_group()) - - # print('step {} rank {} after sync opt_stats {}, {}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) - if writer.is_enabled() and is_last_rank(): - writer.add_scalar('optimizer/variance_l2 vs tokens', opt_stats[0]**0.5, args.consumed_train_tokens, x_axis_tokens) - writer.add_scalar('optimizer/variance_sqrt_l2 vs tokens', opt_stats[1]**0.5, args.consumed_train_tokens, x_axis_tokens) - writer.add_scalar('optimizer/momentum_l2 vs tokens', opt_stats[2]**0.5, args.consumed_train_tokens, x_axis_tokens) - writer.add_scalar('optimizer/weight_l2 vs tokens', opt_stats[3]**0.5, args.consumed_train_tokens, x_axis_tokens) - writer.add_scalar('optimizer/variance_l1 vs tokens', opt_stats[4], args.consumed_train_tokens, x_axis_tokens) - writer.add_scalar('optimizer/variance_sqrt_l1 vs tokens', opt_stats[5], args.consumed_train_tokens, x_axis_tokens) - writer.add_scalar('optimizer/momentum_l1 vs tokens', opt_stats[6], args.consumed_train_tokens, x_axis_tokens) - writer.add_scalar('optimizer/weight_l1 vs tokens', opt_stats[7], 
args.consumed_train_tokens, x_axis_tokens) - writer.add_scalar('optimizer/variance_abs_max vs tokens', opt_stats_2[0], args.consumed_train_tokens, x_axis_tokens) - writer.add_scalar('optimizer/variance_sqrt_abs_max vs tokens', opt_stats_2[1], args.consumed_train_tokens, x_axis_tokens) - writer.add_scalar('optimizer/momentum_abs_max vs tokens', opt_stats_2[2], args.consumed_train_tokens, x_axis_tokens) - writer.add_scalar('optimizer/weight_abs_max vs tokens', opt_stats_2[3], args.consumed_train_tokens, x_axis_tokens) - - writer.add_scalar('optimizer/variance_l2', opt_stats[0]**0.5, iteration) - writer.add_scalar('optimizer/variance_sqrt_l2', opt_stats[1]**0.5, iteration) - writer.add_scalar('optimizer/momentum_l2', opt_stats[2]**0.5, iteration) - writer.add_scalar('optimizer/weight_l2', opt_stats[3]**0.5, iteration) - writer.add_scalar('optimizer/variance_l1', opt_stats[4], iteration) - writer.add_scalar('optimizer/variance_sqrt_l1', opt_stats[5], iteration) - writer.add_scalar('optimizer/momentum_l1', opt_stats[6], iteration) - writer.add_scalar('optimizer/weight_l1', opt_stats[7], iteration) - writer.add_scalar('optimizer/variance_abs_max', opt_stats_2[0], iteration) - writer.add_scalar('optimizer/variance_sqrt_abs_max', opt_stats_2[1], iteration) - writer.add_scalar('optimizer/momentum_abs_max', opt_stats_2[2], iteration) - writer.add_scalar('optimizer/weight_abs_max', opt_stats_2[3], iteration) - - if iteration % args.log_interval == 0: - elapsed_time = timers('interval-time').elapsed(barrier=True) - elapsed_time_per_iteration = elapsed_time / total_iterations - seq_len = args.seq_length - if hasattr(args, 'actual_seq_length'): - seq_len = args.actual_seq_length - samples_per_sec, tflops, approx_parameters_in_billions = throughput_calculator( - model, - args, - elapsed_time, - total_iterations - ) - samples_per_sec_per_replica = samples_per_sec / args.data_parallel_size - tokens_per_sec = samples_per_sec * seq_len - tokens_per_sec_per_replica = tokens_per_sec / args.data_parallel_size - tokens_per_gpu_per_second = tokens_per_sec / args.world_size - tokens_per_gpu_per_second_per_replica = tokens_per_gpu_per_second / args.data_parallel_size - - if writer.is_enabled(): - writer.add_scalar_to_wandb('throughput/iteration-time', elapsed_time_per_iteration, iteration) # 1000 ms / s - writer.add_scalar_to_wandb('throughput/samples_per_sec', samples_per_sec, iteration) - writer.add_scalar_to_wandb('throughput/samples_per_sec_per_replica', samples_per_sec_per_replica, iteration) - writer.add_scalar_to_wandb('throughput/tokens_per_sec', tokens_per_sec, iteration) - writer.add_scalar_to_wandb('throughput/tokens_per_sec_per_replica', tokens_per_sec_per_replica, iteration) - writer.add_scalar_to_wandb('throughput/tokens_per_gpu_per_sec', tokens_per_gpu_per_second, iteration) - writer.add_scalar_to_wandb('throughput/tokens_per_gpu_per_sec_per_replica', tokens_per_gpu_per_second_per_replica, iteration) - writer.add_scalar_to_wandb('throughput/tflops', tflops, iteration) - writer.add_scalar_to_wandb('throughput/approx_params_in_billions', approx_parameters_in_billions, iteration) - writer.add_scalar_to_wandb('throughput/elapsed_ms_per_iteration', elapsed_time_per_iteration, iteration) - if loss_dict is not None: - for k, v in loss_dict.items(): - writer.add_scalar_to_wandb(f'loss/{k}', v, iteration) - - if args.log_timers_to_tensorboard: - writer.add_scalar('iteration-time/iteration-time', - elapsed_time_per_iteration, iteration) - writer.add_scalar('iteration-time/iteration-time vs samples', - 
elapsed_time_per_iteration, args.consumed_train_samples, x_axis_samples) - writer.add_scalar('iteration-time/iteration-time vs tokens', - elapsed_time_per_iteration, args.consumed_train_tokens, x_axis_tokens) - log_string = ' iteration {:8d}/{:8d} |'.format( - iteration, args.train_iters) - log_string += ' consumed samples: {:12d} |'.format( - args.consumed_train_samples) - log_string += ' consumed tokens: {:12d} |'.format( - args.consumed_train_tokens) - log_string += ' elapsed time per iteration (ms): {:.1f} |'.format( - elapsed_time_per_iteration * 1000.0) - log_string += ' learning rate: {:.3E} |'.format(learning_rate) - log_string += ' global batch size: {:5d} |'.format(batch_size) - - for key in total_loss_dict: - if key not in [advanced_iters_key, skipped_iters_key, - nan_iters_key]: - avg = total_loss_dict[key].item() / \ - float(max(1, total_loss_dict[advanced_iters_key])) - if avg > 0.0: - log_string += ' {}: {:.6E} |'.format(key, avg) - total_loss_dict[key] = get_accelerator().FloatTensor([0.0]) - if loss_scale is not None: - log_string += ' loss scale: {:.1f} |'.format(loss_scale) - if grad_norm is not None: - log_string += ' grad norm: {:.3f} |'.format(grad_norm) - if num_zeros_in_grad is not None: - log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad) - if params_norm is not None: - log_string += ' params norm: {:.3f} |'.format(params_norm) - if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: - log_string += ' curriculum seqlen: {:5d} |'.format(args.curriculum_seqlen) - if args.random_ltd: - log_string += ' random ltd reserved length: {:5d} |'.format(args.random_ltd_reserved_length) - log_string += ' actual seqlen: {:5d} |'.format(seq_len) - log_string += ' number of skipped iterations: {:3d} |'.format( - total_loss_dict[skipped_iters_key]) - log_string += ' number of nan iterations: {:3d} |'.format( - total_loss_dict[nan_iters_key]) - log_string += ' samples per second: {:.3f} |'.format(samples_per_sec) - log_string += ' tokens per gpu per second (tgs): {:.3f} |'.format(tokens_per_gpu_per_second) - log_string += ' TFLOPs: {:.2f} |'.format(tflops) - total_loss_dict[advanced_iters_key] = 0 - total_loss_dict[skipped_iters_key] = 0 - total_loss_dict[nan_iters_key] = 0 - print_rank_last(log_string) - if report_memory_flag and learning_rate > 0.: - # Report memory after optimizer state has been initialized. - report_memory('(after {} iterations)'.format(iteration)) - report_memory_flag = False - timers.log(timers_to_log, normalizer=args.log_interval) - - return report_memory_flag + return loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad + return {}, skipped_iter, grad_norm, num_zeros_in_grad +@dlp.log +@ez.dist.timeitlogit(rank=RANK) def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler): timers = get_timers() + assert timers is not None # Extra barrier is added to make sure # all ranks report the max time. 
- timers('save-checkpoint', log_level=0).start(barrier=True) + # assert timers is not None + timers("save-checkpoint", log_level=0).start(barrier=True) save_checkpoint(iteration, model, optimizer, opt_param_scheduler) - timers('save-checkpoint').stop(barrier=True) - checkpoint_throughput_calculator(model, timers('save-checkpoint').elapsed(reset=False)) - timers.log(['save-checkpoint']) - - -def train(forward_step_func, model, optimizer, opt_param_scheduler, - train_data_iterator, valid_data_iterator, - process_non_loss_data_func): + timers("save-checkpoint").stop(barrier=True) + checkpoint_throughput_calculator( + model, timers("save-checkpoint").elapsed(reset=False) + ) + timers.log(["save-checkpoint"]) + + +@dlp.log +def train( + forward_step_func, + model, + optimizer, + opt_param_scheduler, + train_data_iterator, + valid_data_iterator, + process_non_loss_data_func, +): """Train the model function.""" args = get_args() timers = get_timers() - + accelerator = get_accelerator() + assert args is not None and timers is not None and accelerator is not None # Write args to tensorboard write_args_to_tensorboard() - - setup_profiler(args, get_accelerator().device_name()) - + assert accelerator is not None + setup_profiler(args, accelerator.device_name()) if args.random_ltd: # random-ltd requires different randomness on each rank import random - random.seed(args.seed + torch.distributed.get_rank()) + random.seed(args.seed + torch.distributed.get_rank()) # Turn on training mode which enables dropout. for model_module in model: model_module.train() - + grad_norm = None # Tracking loss. total_loss_dict = {} - + loss_dict = {"skipped_iter": 0} # Iterations. iteration = args.iteration - # Translate args to core configuration config = core_transformer_config_from_args(args) + num_skipped_iters = 0 if not args.deepspeed: config.grad_scale_func = optimizer.scale_loss config.timers = timers - - timers('interval-time', log_level=0).start(barrier=True) - print_datetime('before the start of training step') + timers("interval-time", log_level=0).start(barrier=True) + print_datetime("before the start of training step") report_memory_flag = True if args.random_ltd: assert model[0].random_ltd_enabled() - args.random_ltd_layer_num = model[0].random_ltd_scheduler.get_random_ltd_layer_num() - - while iteration < args.train_iters and (args.train_tokens is None or \ - args.consumed_train_tokens < args.train_tokens): + args.random_ltd_layer_num = model[ + 0 + ].random_ltd_scheduler.get_random_ltd_layer_num() + ranges_to_skip = None + if args.train_range_to_skip is not None: + assert ( + len(args.train_range_to_skip) % 2 == 0 + ), f"""Expected --train-range-to-skip to have an even number of values. 
+ Received: {len(args.train_range_to_skip)} + """ + ranges_to_skip = list( + zip( + args.train_range_to_skip[::2], + args.train_range_to_skip[1::2], + ) + ) + while iteration < args.train_iters and ( + args.train_tokens is None or args.consumed_train_tokens < args.train_tokens + ): trigger(on_step_begin) update_num_microbatches(args.consumed_train_samples) if args.deepspeed: # inform deepspeed of any batch size changes - global_batch_size = mpu.get_data_parallel_world_size() * \ - args.micro_batch_size * \ - get_num_microbatches() + global_batch_size = ( + mpu.get_data_parallel_world_size() + * args.micro_batch_size + * get_num_microbatches() + ) model[0].set_train_batch_size(global_batch_size) - if args.curriculum_learning_legacy and not args.no_pipeline_parallel: - curriculum_seqlen = args.curriculum_scheduler.update_difficulty( \ - args.iteration + 1) + curriculum_seqlen = args.curriculum_scheduler.update_difficulty( + args.iteration + 1 + ) if iteration == 0 or curriculum_seqlen != args.curriculum_seqlen: if args.use_rotary_position_embeddings: update_rotary_pos_emb(curriculum_seqlen) args.curriculum_seqlen = curriculum_seqlen args.curr_iteration = iteration - loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \ - train_step(forward_step_func, - train_data_iterator, - model, - optimizer, - opt_param_scheduler, - config) + if ranges_to_skip is not None and any( + [i <= (iteration + 1) <= j for (i, j) in ranges_to_skip] + ): + log.info(f"Caught {iteration + 1} in 'ranges_to_skip', skipping!") + skipped_iter = 1 + num_skipped_iters += 1 + num_zeros_in_grad = None + gas = args.deepspeed_config_dict["gradient_accumulation_steps"] + for microstep in range(gas): + _batch = next(train_data_iterator) + _tokens = _batch["text"] + if ( + iteration < 10 + and os.environ.get("DUMP_SKIPPED_ITERS", None) + and RANK == 0 + ): + log.info(f"{_tokens.shape}, {len(train_data_iterator)=}") + log.info( + f"{iteration=} [{microstep}/{gas}]: ({_tokens.shape})\n{_tokens[:10]=}" + ) + + increment = ( + get_num_microbatches() * args.micro_batch_size * args.data_parallel_size + ) + model[0].skipped_steps += 1 + model[0].global_steps += 1 + model[0].micro_steps += 1 + model[0].global_samples += model[0].train_batch_size() + opt_param_scheduler.step(increment=increment) + else: + if os.getenv("TORCH_PROFILER_ENABLE") == "2": + from torch.profiler import profile, ProfilerActivity + + try: + activities = [ + ProfilerActivity.CPU, + ProfilerActivity.CUDA, + ProfilerActivity.XPU, # type:ignore + ] + except Exception: + log.warning("TORCH PROFILER WARNING: XPU is not supported") + activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA] + with profile(activities=activities) as prof: + loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = train_step( + forward_step_func, + train_data_iterator, + model, + optimizer, + opt_param_scheduler, + config, + ) + prof.export_chrome_trace( + f"{args.trace_dir}/torch-trace-{RANK}-of-{WORLD_SIZE}-step{iteration}.json" + ) + else: + loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = train_step( + forward_step_func, + train_data_iterator, + model, + optimizer, + opt_param_scheduler, + config, + ) iteration += 1 args.iteration = iteration - new_samples = mpu.get_data_parallel_world_size() * \ - args.micro_batch_size * \ - get_num_microbatches() + new_samples = ( + mpu.get_data_parallel_world_size() + * args.micro_batch_size + * get_num_microbatches() + ) args.consumed_train_samples += new_samples # This actual_seq_length is used for actual consumed tokens calculation, 
flops calculation, and logging. args.actual_seq_length = args.seq_length if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: args.actual_seq_length = args.curriculum_seqlen if args.random_ltd: - args.random_ltd_reserved_length = model[0].random_ltd_scheduler.get_current_seq() + args.random_ltd_reserved_length = model[ + 0 + ].random_ltd_scheduler.get_current_seq() if args.random_ltd_reserved_length < args.actual_seq_length: - args.actual_seq_length = (args.actual_seq_length * (args.num_layers - args.random_ltd_layer_num) + args.random_ltd_reserved_length * args.random_ltd_layer_num) // args.num_layers + args.actual_seq_length = ( + args.actual_seq_length + * (args.num_layers - args.random_ltd_layer_num) + + args.random_ltd_reserved_length * args.random_ltd_layer_num + ) // args.num_layers if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: - if hasattr(args, 'data_efficiency_curriculum_learning_numel'): - act_mbsz = args.data_efficiency_curriculum_learning_numel / args.curriculum_seqlen + if hasattr(args, "data_efficiency_curriculum_learning_numel"): + act_mbsz = ( + args.data_efficiency_curriculum_learning_numel + / args.curriculum_seqlen + ) act_token = act_mbsz * args.actual_seq_length - args.consumed_train_tokens += mpu.get_data_parallel_world_size() * \ - get_num_microbatches() * act_token + args.consumed_train_tokens += ( + mpu.get_data_parallel_world_size() + * get_num_microbatches() + * act_token + ) else: args.consumed_train_tokens += new_samples * args.actual_seq_length else: args.consumed_train_tokens += new_samples * args.actual_seq_length - # Logging. if args.deepspeed: - if hasattr(model[0].optimizer, 'cur_scale'): + if hasattr(model[0].optimizer, "cur_scale"): loss_scale = model[0].optimizer.cur_scale else: loss_scale = None @@ -1308,91 +1175,103 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, params_norm = None if args.log_params_norm: params_norm = calc_params_l2_norm(model) - report_memory_flag = training_log(loss_dict, total_loss_dict, - optimizer.param_groups[0]['lr'], - iteration, loss_scale, - report_memory_flag, skipped_iter, - grad_norm, params_norm, num_zeros_in_grad, - model, optimizer) - + report_memory_flag = training_log( + loss_dict, + total_loss_dict, + optimizer.param_groups[0]["lr"], + iteration, + loss_scale, + report_memory_flag, + skipped_iter, + grad_norm, + params_norm, + num_zeros_in_grad, + model, + optimizer, + ) # Autoresume - if args.adlr_autoresume and \ - (iteration % args.adlr_autoresume_interval == 0): - check_adlr_autoresume_termination(iteration, model, optimizer, - opt_param_scheduler) - + if args.adlr_autoresume and (iteration % args.adlr_autoresume_interval == 0): + check_adlr_autoresume_termination( + iteration, model, optimizer, opt_param_scheduler + ) # Evaluation - if args.eval_interval and iteration % args.eval_interval == 0 and \ - args.do_valid: - prefix = 'iteration {}'.format(iteration) - evaluate_and_print_results(prefix, forward_step_func, - valid_data_iterator, model, - iteration, process_non_loss_data_func, - config, False) - + if args.eval_interval and iteration % args.eval_interval == 0 and args.do_valid: + prefix = "iteration {}".format(iteration) + evaluate_and_print_results( + prefix, + forward_step_func, + valid_data_iterator, + model, + iteration, + process_non_loss_data_func, + config, + False, + ) # Checkpointing saved_checkpoint = False if args.exit_signal_handler: signal_handler = get_signal_handler() - if 
any(signal_handler.signals_received()): - save_checkpoint_and_time(iteration, model, optimizer, - opt_param_scheduler) - print_datetime('exiting program after receiving SIGTERM.') + # if any(signal_handler.signals_received()): + if signal_handler is not None and any(signal_handler.signals_received()): + save_checkpoint_and_time( + iteration, model, optimizer, opt_param_scheduler + ) + print_datetime("exiting program after receiving SIGTERM.") sys.exit() - - if args.save and args.save_interval and \ - iteration % args.save_interval == 0: - save_checkpoint_and_time(iteration, model, optimizer, - opt_param_scheduler) + if args.save and args.save_interval and iteration % args.save_interval == 0: + save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler) saved_checkpoint = True - # Exiting based on duration if args.exit_duration_in_mins: train_time = (time.time() - _TRAIN_START_TIME) / 60.0 - done_cuda = get_accelerator().IntTensor( - [train_time > args.exit_duration_in_mins]) - torch.distributed.all_reduce( - done_cuda, op=torch.distributed.ReduceOp.MAX) + done_cuda = accelerator.IntTensor([train_time > args.exit_duration_in_mins]) + torch.distributed.all_reduce(done_cuda, op=torch.distributed.ReduceOp.MAX) done = done_cuda.item() if done: if not saved_checkpoint: - save_checkpoint_and_time(iteration, model, optimizer, - opt_param_scheduler) - print_datetime('exiting program after {} minutes'.format(train_time)) + save_checkpoint_and_time( + iteration, model, optimizer, opt_param_scheduler + ) + print_datetime("exiting program after {} minutes".format(train_time)) sys.exit() - # Exiting based on iterations if args.exit_interval and iteration % args.exit_interval == 0: if args.save and not saved_checkpoint: - save_checkpoint_and_time(iteration, model, optimizer, - opt_param_scheduler) + save_checkpoint_and_time( + iteration, model, optimizer, opt_param_scheduler + ) torch.distributed.barrier() - print_datetime('exiting program at iteration {}'.format(iteration)) + print_datetime("exiting program at iteration {}".format(iteration)) sys.exit() trigger(on_step_end) - # Exiting based on kill switch file if found_kill_switch(): if args.save and not saved_checkpoint: - save_checkpoint_and_time(iteration, model, optimizer, - opt_param_scheduler) + save_checkpoint_and_time( + iteration, model, optimizer, opt_param_scheduler + ) torch.distributed.barrier() - print_datetime(f"Detected kill switch at {args.kill_switch_file}, " - f"iteration={iteration}. Exiting") + print_datetime( + f"Detected kill switch at {args.kill_switch_file}, " + f"iteration={iteration}. 
Exiting" + ) sys.exit() - return iteration -def evaluate(forward_step_func, - data_iterator, - model, - process_non_loss_data_func, - config, - verbose=False): +@dlp.log +def evaluate( + forward_step_func, + data_iterator, + model, + process_non_loss_data_func, + config, + verbose=False, +): """Evaluation.""" args = get_args() - + accelerator = get_accelerator() + assert args is not None and accelerator is not None if args.vision_pretraining and args.vision_pretraining_type == "dino": compute_feature_bank(model) @@ -1414,73 +1293,82 @@ def evaluate(forward_step_func, total_loss_dict = {} + num_microbatches = get_num_microbatches() + assert num_microbatches is not None + forward_backward_func = get_forward_backward_func() + with torch.no_grad(): iteration = 0 while iteration < args.eval_iters: iteration += 1 if verbose and iteration % args.log_interval == 0: - print_rank_0('Evaluating iter {}/{}'.format(iteration, - args.eval_iters)) + log.info("Evaluating iter {}/{}".format(iteration, args.eval_iters)) - forward_backward_func = get_forward_backward_func() # Don't care about timing during evaluation config.timers = None if args.deepspeed and args.ds_pipeline_enabled: # DeepSpeed uses eval_batch() and already aggregates losses. assert isinstance(model, list) and len(model) == 1 loss = model[0].eval_batch(data_iterator) - loss_dicts = [{'lm loss' : loss}] * get_num_microbatches() + loss_dicts = [{"lm loss": loss}] * num_microbatches else: loss_dicts = forward_backward_func( forward_step_func=forward_step_func, data_iterator=data_iterator, model=model, - num_microbatches=get_num_microbatches(), + num_microbatches=num_microbatches, seq_length=args.seq_length, micro_batch_size=args.micro_batch_size, decoder_seq_length=args.decoder_seq_length, - forward_only=True) + forward_only=True, + ) config.timers = get_timers() # Empty unused memory if args.empty_unused_memory_level >= 1: - get_accelerator().empty_cache() + accelerator.empty_cache() if mpu.is_pipeline_last_stage(ignore_virtual=True): # Reduce across processes. for loss_dict in loss_dicts: for key in loss_dict: - if 'moe' not in key: - total_loss_dict[key] = total_loss_dict.get( - key, get_accelerator().FloatTensor([0.0])) + loss_dict[key] - - args.consumed_valid_samples += mpu.get_data_parallel_world_size() \ - * args.micro_batch_size \ - * get_num_microbatches() + if "moe" not in key: + total_loss_dict[key] = ( + total_loss_dict.get(key, accelerator.FloatTensor([0.0])) + + loss_dict[key] + ) + + args.consumed_valid_samples += ( + mpu.get_data_parallel_world_size() + * args.micro_batch_size + * num_microbatches + ) collected_non_loss_data = None if process_non_loss_data_func is not None and is_last_rank(): collected_non_loss_data = forward_backward_func( forward_step_func=forward_step_func, data_iterator=data_iterator, model=model, - num_microbatches=get_num_microbatches(), + num_microbatches=num_microbatches, seq_length=args.seq_length, micro_batch_size=args.micro_batch_size, decoder_seq_length=args.decoder_seq_length, forward_only=True, - collect_non_loss_data=True) + collect_non_loss_data=True, + ) # Move model back to the train mode. for model_module in model: model_module.train() for key in total_loss_dict: - total_loss_dict[key] /= args.eval_iters * get_num_microbatches() + total_loss_dict[key] /= args.eval_iters * num_microbatches if args.curriculum_learning_legacy and not args.no_pipeline_parallel: # roll back to actual curriculum seqlen at the end of eval. 
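
A minimal sketch, not part of the patch, of the accumulate-then-normalize bookkeeping that `evaluate()` performs above: per micro-batch loss dicts are summed into `total_loss_dict` and finally divided by `eval_iters * num_microbatches`. All counts and loss values below are invented.

```python
# Illustrative sketch only -- mirrors evaluate()'s loss accumulation; the
# eval_iters, num_microbatches, and loss values are made up.
import torch

eval_iters, num_microbatches = 3, 4
total_loss_dict = {}
for _ in range(eval_iters):
    # stand-in for the reduced loss dicts returned per micro-batch
    loss_dicts = [{"lm loss": torch.tensor([2.0])} for _ in range(num_microbatches)]
    for loss_dict in loss_dicts:
        for key, val in loss_dict.items():
            total_loss_dict[key] = total_loss_dict.get(key, torch.tensor([0.0])) + val

for key in total_loss_dict:
    total_loss_dict[key] /= eval_iters * num_microbatches

print(total_loss_dict["lm loss"])  # tensor([2.]) -- the per-micro-batch average
```
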
- args.curriculum_seqlen = args.curriculum_scheduler.update_difficulty( \ - args.iteration + 1) + args.curriculum_seqlen = args.curriculum_scheduler.update_difficulty( + args.iteration + 1 + ) if args.curriculum_seqlen < args.seq_length: if args.use_rotary_position_embeddings: update_rotary_pos_emb(args.curriculum_seqlen) @@ -1488,55 +1376,92 @@ def evaluate(forward_step_func, return total_loss_dict, collected_non_loss_data -def evaluate_and_print_results(prefix, forward_step_func, - data_iterator, model, - iteration, process_non_loss_data_func, config, - verbose=False, write_to_tensorboard=True, test=False): + +@dlp.log +def evaluate_and_print_results( + prefix, + forward_step_func, + data_iterator, + model, + iteration, + process_non_loss_data_func, + config, + verbose=False, + write_to_tensorboard=True, + test=False, +): """Helper function to evaluate and dump results on screen.""" args = get_args() + assert args is not None if write_to_tensorboard: - writer = interop_tool_logger(tb_writer=get_tensorboard_writer(), wandb_writer=get_wandb_writer()) + writer = get_tensorboard_writer() else: - writer = interop_tool_logger() - x_axis_samples = 'Samples' - x_axis_tokens = 'Tokens' + writer = None total_loss_dict, collected_non_loss_data = evaluate( - forward_step_func, data_iterator, model, - process_non_loss_data_func, config, verbose) - string = ' validation loss at {} | '.format(prefix) + forward_step_func, + data_iterator, + model, + process_non_loss_data_func, + config, + verbose, + ) + key = "test" if test else "val" + if wandb is not None and wandb.run is not None: + wandb.log({ + f"{key}/iteration": iteration, + **{f"{key}/{k}": v for k, v in total_loss_dict.items()}, + **{ + f"{key}/ppl_{k}": math.exp(min(20, v.item())) + for k, v in total_loss_dict.items() + }, + }) + string = " validation loss at {} | ".format(prefix) for key in total_loss_dict: - string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item()) + string += f"{key} value={total_loss_dict[key].item():.6f}" ppl = math.exp(min(20, total_loss_dict[key].item())) - string += '{} PPL: {:.6E} | '.format(key, ppl) - if writer.is_enabled() and is_last_rank(): - data_type = 'test' if test else 'validation' - writer.add_scalar(f'lm-loss-validation/{key} {data_type}', - total_loss_dict[key].item(), - iteration) - writer.add_scalar(f'lm-loss-validation/{key} {data_type} vs samples', - total_loss_dict[key].item(), - args.consumed_train_samples, - x_axis_samples) - writer.add_scalar(f'lm-loss-validation/{key} {data_type} vs tokens', - total_loss_dict[key].item(), - args.consumed_train_tokens, - x_axis_tokens) + string += f"{key} PPL={ppl:.6f}" + # string += '{} PPL={:.6f} | '.format(key, ppl) + if writer is not None and is_last_rank(): + data_type = "test" if test else "validation" + writer.add_scalar( + f"lm-loss-validation/{key} {data_type}", + total_loss_dict[key].item(), + iteration, + ) + writer.add_scalar( + f"lm-loss-validation/{key} {data_type} vs samples", + total_loss_dict[key].item(), + args.consumed_train_samples, + ) + writer.add_scalar( + f"lm-loss-validation/{key} {data_type} vs tokens", + total_loss_dict[key].item(), + args.consumed_train_tokens, + ) if args.log_validation_ppl_to_tensorboard: - writer.add_scalar(f'lm-loss-validation/{key} {data_type} ppl', ppl, - iteration) - writer.add_scalar(f'lm-loss-validation/{key} {data_type} ppl vs samples', - ppl, args.consumed_train_samples, x_axis_samples) - writer.add_scalar(f'lm-loss-validation/{key} {data_type} ppl vs tokens', - ppl, args.consumed_train_tokens, 
x_axis_tokens) - - if process_non_loss_data_func is not None and writer.is_enabled() and is_last_rank(): + writer.add_scalar( + f"lm-loss-validation/{key} {data_type} ppl", ppl, iteration + ) + writer.add_scalar( + f"lm-loss-validation/{key} {data_type} ppl vs samples", + ppl, + args.consumed_train_samples, + ) + writer.add_scalar( + f"lm-loss-validation/{key} {data_type} ppl vs tokens", + ppl, + args.consumed_train_tokens, + ) + + if process_non_loss_data_func is not None and writer and is_last_rank(): process_non_loss_data_func(collected_non_loss_data, iteration, writer) length = len(string) + 1 - print_rank_last('-' * length) - print_rank_last(string) - print_rank_last('-' * length) + log.info("-" * length) + log.info(string) + log.info("-" * length) + return total_loss_dict def cyclic_iter(iter): @@ -1545,122 +1470,147 @@ def cyclic_iter(iter): yield x +@dlp.log +@ez.dist.timeitlogit(rank=RANK) def build_train_valid_test_datasets(build_train_valid_test_datasets_provider): """Build pretraining datasets.""" args = get_args() # Number of train/valid/test samples. + assert args is not None if args.train_samples: train_samples = args.train_samples else: train_samples = args.train_iters * args.global_batch_size - eval_iters = (args.train_iters // args.eval_interval + 1) * \ - args.eval_iters + eval_iters = (args.train_iters // args.eval_interval + 1) * args.eval_iters test_iters = args.eval_iters - train_val_test_num_samples = [train_samples, - eval_iters * args.global_batch_size, - test_iters * args.global_batch_size] - print_rank_0(' > datasets target sizes (minimum size):') - print_rank_0(' train: {}'.format(train_val_test_num_samples[0])) - print_rank_0(' validation: {}'.format(train_val_test_num_samples[1])) - print_rank_0(' test: {}'.format(train_val_test_num_samples[2])) + train_val_test_num_samples = [ + train_samples, + eval_iters * args.global_batch_size, + test_iters * args.global_batch_size, + ] + log.info(" > datasets target sizes (minimum size):") + log.info(" train: {}".format(train_val_test_num_samples[0])) + log.info(" validation: {}".format(train_val_test_num_samples[1])) + log.info(" test: {}".format(train_val_test_num_samples[2])) # Build the datasets. return build_train_valid_test_datasets_provider(train_val_test_num_samples) -def build_train_valid_test_data_loaders( - build_train_valid_test_datasets_provider): +@dlp.log +@ez.dist.timeitlogit(rank=RANK) +def build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider): """Build pretraining data loaders.""" - args = get_args() - + accelerator = get_accelerator() + assert args is not None and accelerator is not None (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None) - - print_rank_0('> building train, validation, and test datasets ...') - + log.info("> building train, validation, and test datasets ...") # Backward compatibility, assume fixed batch size. 
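
As a worked example of the split-size arithmetic in `build_train_valid_test_datasets` above (iteration-based training, no `--train-samples`), with invented values and not part of the patch:

```python
# Illustrative sketch only -- the numbers are made up; the formulas mirror
# build_train_valid_test_datasets above.
train_iters, global_batch_size = 1_000, 256
eval_interval, eval_iters = 100, 10

train_samples = train_iters * global_batch_size                # 256_000
valid_iters = (train_iters // eval_interval + 1) * eval_iters  # 110 eval iterations total
test_iters = eval_iters

train_val_test_num_samples = [
    train_samples,                    # 256_000
    valid_iters * global_batch_size,  # 28_160
    test_iters * global_batch_size,   # 2_560
]
print(train_val_test_num_samples)
```
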
if args.iteration > 0 and args.consumed_train_samples == 0: - assert args.train_samples is None, \ - 'only backward compatiblity support for iteration-based training' + assert ( + args.train_samples is None + ), "only backward compatiblity support for iteration-based training" args.consumed_train_samples = args.iteration * args.global_batch_size if args.iteration > 0 and args.consumed_valid_samples == 0: if args.train_samples is None: - args.consumed_valid_samples = (args.iteration // args.eval_interval) * \ - args.eval_iters * args.global_batch_size - + args.consumed_valid_samples = ( + (args.iteration // args.eval_interval) + * args.eval_iters + * args.global_batch_size + ) # Data loader only on rank 0 of each model parallel group. - ds_sequence_parallel = mpu.get_sequence_parallel_world_size() > 1 or args.force_ds_sequence_parallel - rank_in_parallel_group = mpu.get_sequence_parallel_rank() if ds_sequence_parallel else mpu.get_tensor_model_parallel_rank() + ds_sequence_parallel = ( + mpu.get_sequence_parallel_world_size() > 1 or args.force_ds_sequence_parallel + ) + rank_in_parallel_group = ( + mpu.get_sequence_parallel_rank() + if ds_sequence_parallel + else mpu.get_tensor_model_parallel_rank() + ) if rank_in_parallel_group == 0: # Build datasets. train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - build_train_valid_test_datasets_provider) - + build_train_valid_test_datasets_provider + ) # Build dataloders. train_dataloader = build_pretraining_data_loader( - train_ds, args.consumed_train_samples) + train_ds, args.consumed_train_samples + ) valid_dataloader = build_pretraining_data_loader( - valid_ds, args.consumed_valid_samples) + valid_ds, args.consumed_valid_samples + ) test_dataloader = build_pretraining_data_loader(test_ds, 0) - # Flags to know if we need to do training/validation/testing. do_train = train_dataloader is not None and args.train_iters > 0 do_valid = valid_dataloader is not None and args.eval_iters > 0 do_test = test_dataloader is not None and args.eval_iters > 0 # Need to broadcast num_tokens and num_type_tokens. - flags = get_accelerator().LongTensor( - [int(do_train), int(do_valid), int(do_test)]) + flags = accelerator.LongTensor([int(do_train), int(do_valid), int(do_test)]) else: - flags = get_accelerator().LongTensor([0, 0, 0]) - + flags = accelerator.LongTensor([0, 0, 0]) # Broadcast num tokens. if ds_sequence_parallel: - torch.distributed.broadcast(flags, - mpu.get_sequence_parallel_src_rank(), - group=mpu.get_sequence_parallel_group()) + torch.distributed.broadcast( + flags, + mpu.get_sequence_parallel_src_rank(), + group=mpu.get_sequence_parallel_group(), + ) else: - torch.distributed.broadcast(flags, - mpu.get_tensor_model_parallel_src_rank(), - group=mpu.get_tensor_model_parallel_group()) + torch.distributed.broadcast( + flags, + mpu.get_tensor_model_parallel_src_rank(), + group=mpu.get_tensor_model_parallel_group(), + ) args.do_train = flags[0].item() args.do_valid = flags[1].item() args.do_test = flags[2].item() - return train_dataloader, valid_dataloader, test_dataloader -def build_train_valid_test_data_iterators( - build_train_valid_test_datasets_provider): +@dlp.log +@ez.dist.timeitlogit(rank=RANK) +def build_train_valid_test_data_iterators(build_train_valid_test_datasets_provider): """Build pretraining data iterators.""" args = get_args() + assert args is not None # Build loaders. 
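
A small numeric sketch, not part of the patch, of the backward-compatibility branch above that reconstructs the consumed-sample counters when resuming an iteration-based run; all values are invented.

```python
# Illustrative sketch only -- mirrors the resume bookkeeping above.
iteration = 5_000            # restored from the checkpoint
global_batch_size = 256
eval_interval, eval_iters = 100, 10

consumed_train_samples = iteration * global_batch_size               # 1_280_000
consumed_valid_samples = (
    (iteration // eval_interval) * eval_iters * global_batch_size    # 128_000
)
print(consumed_train_samples, consumed_valid_samples)
```
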
- train_dataloader, valid_dataloader, test_dataloader = \ - build_train_valid_test_data_loaders( - build_train_valid_test_datasets_provider) + train_dataloader, valid_dataloader, test_dataloader = ( + build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider) + ) # Build iterators. dl_type = args.dataloader_type - assert dl_type in ['single', 'cyclic'] + assert dl_type in ["single", "cyclic"] if train_dataloader is not None: - train_data_iterator = iter(train_dataloader) if dl_type == 'single' \ - else iter(cyclic_iter(train_dataloader)) + train_data_iterator = ( + iter(train_dataloader) + if dl_type == "single" + else iter(cyclic_iter(train_dataloader)) + ) else: train_data_iterator = None if valid_dataloader is not None: - valid_data_iterator = iter(valid_dataloader) if dl_type == 'single' \ - else iter(cyclic_iter(valid_dataloader)) + valid_data_iterator = ( + iter(valid_dataloader) + if dl_type == "single" + else iter(cyclic_iter(valid_dataloader)) + ) else: valid_data_iterator = None if test_dataloader is not None: - test_data_iterator = iter(test_dataloader) if dl_type == 'single' \ - else iter(cyclic_iter(test_dataloader)) + test_data_iterator = ( + iter(test_dataloader) + if dl_type == "single" + else iter(cyclic_iter(test_dataloader)) + ) else: test_data_iterator = None diff --git a/megatron/training_log.py b/megatron/training_log.py new file mode 100644 index 0000000000..3eb96c392d --- /dev/null +++ b/megatron/training_log.py @@ -0,0 +1,669 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +""" +training_log.py +""" + +import logging +import os + +from deepspeed import get_accelerator +import ezpz as ez +import torch + +from megatron.core import mpu +from megatron.global_vars import ( + get_args, + get_num_microbatches, + get_tensorboard_writer, + get_timers, +) +from megatron.utils import ( + Profile, + is_last_rank, + report_memory, + throughput_calculator, + num_floating_point_operations, +) + + +RANK: int = ez.get_rank() +WORLD_SIZE: int = ez.get_world_size() +DEVICE_TYPE: str = ez.dist.get_torch_device_type() +DEVICE: torch.device = torch.device(DEVICE_TYPE) + +log: logging.Logger = logging.getLogger(__name__) +LOG_LEVEL: str = str(os.environ.get("LOG_LEVEL", "INFO")).upper() +log.setLevel(LOG_LEVEL) if RANK == 0 else log.setLevel("CRITICAL") + +try: + import wandb +except (ImportError, ModuleNotFoundError): + wandb = None + + +dlp = Profile("TRAINING_LOG") + + +@dlp.log +def training_log( + loss_dict, + total_loss_dict, + learning_rate, + iteration, + loss_scale, + report_memory_flag, + skipped_iter, + grad_norm, + params_norm, + num_zeros_in_grad, + model=None, + optimizer=None, +): + """Log training information such as losses, timing, ....""" + args = get_args() + accelerator = get_accelerator() + timers = get_timers() + writer = get_tensorboard_writer() + assert args is not None and timers is not None and accelerator is not None + wandb_metrics = {} + # Advanced, skipped, and Nan iterations. + advanced_iters_key = "advanced iterations" + skipped_iters_key = "skipped iterations" + nan_iters_key = "nan iterations" + # Advanced iterations. + if not skipped_iter: + total_loss_dict[advanced_iters_key] = ( + total_loss_dict.get(advanced_iters_key, 0) + 1 + ) + else: + if advanced_iters_key not in total_loss_dict: + total_loss_dict[advanced_iters_key] = 0 + # Skipped iterations. 
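
With `--dataloader-type cyclic`, the iterators above wrap each dataloader in `cyclic_iter`, which restarts the underlying iterable indefinitely. A tiny demonstration, not part of the patch, using a plain list in place of a real `DataLoader`:

```python
# Illustrative sketch only -- same generator shape as cyclic_iter in the patch.
def cyclic_iter(iterable):
    while True:
        for x in iterable:
            yield x

it = iter(cyclic_iter([1, 2, 3]))
print([next(it) for _ in range(7)])  # [1, 2, 3, 1, 2, 3, 1]
```
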
+ total_loss_dict[skipped_iters_key] = ( + total_loss_dict.get(skipped_iters_key, 0) + skipped_iter + ) + # Update losses and set nan iterations + got_nan = False + for key in loss_dict: + if not skipped_iter: + total_loss_dict[key] = ( + total_loss_dict.get(key, accelerator.FloatTensor([0.0])) + + loss_dict[key] + ) + else: + try: + value = loss_dict[key].float().sum().item() + except AttributeError: + value = loss_dict[key] + is_nan = value == float("inf") or value == -float("inf") or value != value + got_nan = got_nan or is_nan + total_loss_dict[nan_iters_key] = total_loss_dict.get(nan_iters_key, 0) + int( + got_nan + ) + + # Logging. + timers_to_log = [ + "forward-backward", + "forward-compute", + "backward-compute", + "batch-generator", + "forward-recv", + "forward-send", + "backward-recv", + "backward-send", + "forward-send-forward-recv", + "forward-send-backward-recv", + "backward-send-forward-recv", + "backward-send-backward-recv", + "forward-backward-send-forward-backward-recv", + "layernorm-grads-all-reduce", + "embedding-grads-all-reduce", + "grads-all-reduce", + "grads-reduce-scatter", + "params-all-gather", + "optimizer-copy-to-main-grad", + "optimizer-unscale-and-check-inf", + "optimizer-clip-main-grad", + "optimizer-count-zeros", + "optimizer-inner-step", + "optimizer-copy-main-to-model-params", + "optimizer", + ] + + # Calculate batch size. + batch_size = ( + args.micro_batch_size * args.data_parallel_size * get_num_microbatches() + ) + total_iterations = ( + total_loss_dict[advanced_iters_key] + total_loss_dict[skipped_iters_key] + ) + + # Tensorboard values. + # Timer requires all the ranks to call. + if args.log_timers_to_tensorboard and ( + iteration % args.tensorboard_log_interval == 0 + ): + timers.write(timers_to_log, writer, iteration, normalizer=total_iterations) + if writer and (iteration % args.tensorboard_log_interval == 0): + writer.add_scalar( + "steps-vs-samples/y=steps,x=samples", iteration, args.consumed_train_samples + ) + writer.add_scalar( + "steps-vs-samples/y=samples,x=steps", args.consumed_train_samples, iteration + ) + writer.add_scalar( + "steps-vs-tokens/y=steps,x=tokens", iteration, args.consumed_train_tokens + ) + writer.add_scalar( + "steps-vs-tokens/y=tokens,x=steps", args.consumed_train_tokens, iteration + ) + if args.log_learning_rate_to_tensorboard: + wandb_metrics |= { + "learning-rate/iteration": iteration, + "learning-rate/learning-rate": learning_rate, + } + writer.add_scalar("learning-rate/learning-rate", learning_rate, iteration) + writer.add_scalar( + "learning-rate/learning-rate vs samples", + learning_rate, + args.consumed_train_samples, + ) + writer.add_scalar( + "learning-rate/learning-rate vs tokens", + learning_rate, + args.consumed_train_tokens, + ) + if args.log_batch_size_to_tensorboard: + writer.add_scalar("batch-size/batch-size", batch_size, iteration) + writer.add_scalar( + "batch-size/batch-size vs samples", + batch_size, + args.consumed_train_samples, + ) + writer.add_scalar( + "batch-size/batch-size vs tokens", + batch_size, + args.consumed_train_tokens, + ) + wandb_metrics |= { + "lm-loss-training/iteration": iteration, + "lm-loss-training/consumed_train_tokens": args.consumed_train_tokens, + } + for key in loss_dict: + wandb_metrics |= {f"lm-loss-training/{key}": loss_dict[key]} + writer.add_scalar(f"lm-loss-training/{key}", loss_dict[key], iteration) + writer.add_scalar( + f"lm-loss-training/{key}" + " vs samples", + loss_dict[key], + args.consumed_train_samples, + ) + writer.add_scalar( + f"lm-loss-training/{key}" + 
" vs tokens", + loss_dict[key], + args.consumed_train_tokens, + ) + if args.fp16 and loss_scale and args.log_loss_scale_to_tensorboard: + writer.add_scalar("loss-scale/loss-scale", loss_scale, iteration) + writer.add_scalar( + "loss-scale/loss-scale vs samples", + loss_scale, + args.consumed_train_samples, + ) + writer.add_scalar( + "loss-scale/loss-scale vs tokens", + loss_scale, + args.consumed_train_tokens, + ) + if args.log_world_size_to_tensorboard: + writer.add_scalar("world-size/world-size", args.world_size, iteration) + writer.add_scalar( + "world-size/world-size vs samples", + args.world_size, + args.consumed_train_samples, + ) + writer.add_scalar( + "world-size/world-size vs tokens", + args.world_size, + args.consumed_train_tokens, + ) + if grad_norm is not None: + wandb_metrics |= {"training/grad-norm": grad_norm} + writer.add_scalar("grad-norm/grad-norm", grad_norm, iteration) + writer.add_scalar( + "grad-norm/grad-norm vs samples", grad_norm, args.consumed_train_samples + ) + writer.add_scalar( + "grad-norm/grad-norm vs tokens", grad_norm, args.consumed_train_tokens + ) + if num_zeros_in_grad is not None: + wandb_metrics |= {"training/num-zeros": num_zeros_in_grad} + writer.add_scalar("num-zeros/num-zeros", num_zeros_in_grad, iteration) + writer.add_scalar( + "num-zeros/num-zeros vs samples", + num_zeros_in_grad, + args.consumed_train_samples, + ) + writer.add_scalar( + "num-zeros/num-zeros vs tokens", + num_zeros_in_grad, + args.consumed_train_tokens, + ) + if params_norm is not None: + wandb_metrics |= {"training/params-norm": params_norm} + writer.add_scalar("params-norm/params-norm", params_norm, iteration) + writer.add_scalar( + "params-norm/params-norm vs samples", + params_norm, + args.consumed_train_samples, + ) + writer.add_scalar( + "params-norm/params-norm vs tokens", + params_norm, + args.consumed_train_tokens, + ) + if hasattr(args, "actual_seq_length"): + writer.add_scalar( + "seqlen/actual_seq_length", args.actual_seq_length, iteration + ) + writer.add_scalar( + "seqlen/actual_seq_length vs samples", + args.actual_seq_length, + args.consumed_train_samples, + ) + writer.add_scalar( + "seqlen/actual_seq_length vs tokens", + args.actual_seq_length, + args.consumed_train_tokens, + ) + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: + writer.add_scalar( + "seqlen/curriculum_seqlen", args.curriculum_seqlen, iteration + ) + writer.add_scalar( + "seqlen/curriculum_seqlen vs samples", + args.curriculum_seqlen, + args.consumed_train_samples, + ) + writer.add_scalar( + "seqlen/curriculum_seqlen vs tokens", + args.curriculum_seqlen, + args.consumed_train_tokens, + ) + if args.random_ltd: + writer.add_scalar( + "seqlen/random_ltd_reserved_length", + args.random_ltd_reserved_length, + iteration, + ) + writer.add_scalar( + "seqlen/random_ltd_reserved_length vs samples", + args.random_ltd_reserved_length, + args.consumed_train_samples, + ) + writer.add_scalar( + "seqlen/random_ltd_reserved_length vs tokens", + args.random_ltd_reserved_length, + args.consumed_train_tokens, + ) + if args.log_memory_to_tensorboard: + mem_stats = torch.cuda.memory_stats() + writer.add_scalar( + "mem-reserved-bytes", + mem_stats["reserved_bytes.all.current"], + iteration, + ) + writer.add_scalar( + "mem-allocated-bytes", + mem_stats["allocated_bytes.all.current"], + iteration, + ) + writer.add_scalar( + "mem-allocated-count", + mem_stats["allocation.all.current"], + iteration, + ) + if iteration % args.tensorboard_log_interval == 0: + # This logging write various 
optimizer states to tensorboard. This + # feature may consume extra GPU memory thus is set at false by default. + if args.log_optimizer_states_to_tensorboard and optimizer is not None: + opt_stats = [0.0] * 8 + opt_stats_2 = [0.0] * 4 + for _, group in enumerate(optimizer.param_groups): + for _, param in enumerate(group["params"]): + state_param = getattr(optimizer, "state", None) + if state_param is not None: + exp_avg_sq = state_param.get("exp_avg_sq", torch.tensor(0.0)) + exp_avg = state_param.get("exp_avg", torch.tensor(0.0)) + opt_stats[0] += (torch.norm(exp_avg_sq).item()) ** 2 + opt_stats[1] += (torch.norm(exp_avg_sq.sqrt()).item()) ** 2 + opt_stats[2] += (torch.norm(exp_avg).item()) ** 2 + opt_stats[3] += (torch.norm(param).item()) ** 2 + opt_stats[4] += torch.norm(exp_avg_sq, p=1).item() + opt_stats[5] += torch.norm(exp_avg_sq.sqrt(), p=1).item() + opt_stats[6] += torch.norm(exp_avg, p=1).item() + opt_stats[7] += torch.norm(param, p=1).item() + opt_stats_2[0] = max( + opt_stats_2[0], + abs(exp_avg_sq.max().item()), + abs(exp_avg_sq.min().item()), + ) + opt_stats_2[1] = max( + opt_stats_2[1], exp_avg_sq.sqrt().abs_().max().item() + ) + opt_stats_2[2] = max( + opt_stats_2[2], + abs(exp_avg.max().item()), + abs(exp_avg.min().item()), + ) + opt_stats_2[3] = max( + opt_stats_2[3], + abs(param.max().item()), + abs(param.min().item()), + ) + # print('step {} rank {} before sync opt_stats {}, {}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) + if args.zero_stage > 0: + # ZeRO partiions optimizer states + # opt_stats = opt_stats.clone().detach() + # opt_stats = get_accelerator().FloatTensor + opt_stats = accelerator.FloatTensor(opt_stats) + torch.distributed.all_reduce( + opt_stats, group=mpu.get_sequence_data_parallel_group() + ) + # opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) + # opt_stats_2 = opt_stats_2.clone().detach() + opt_stats_2 = accelerator.FloatTensor(opt_stats_2) + torch.distributed.all_reduce( + opt_stats_2, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_sequence_data_parallel_group(), + ) + + if args.tensor_model_parallel_size > 1: + # opt_stats = opt_stats.clone().detach() + opt_stats = accelerator.FloatTensor(opt_stats) + torch.distributed.all_reduce( + opt_stats, group=mpu.get_tensor_model_parallel_group() + ) + # opt_stats_2 = opt_stats_2.clone().detach() + opt_stats_2 = accelerator.FloatTensor(opt_stats_2) + torch.distributed.all_reduce( + opt_stats_2, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_tensor_model_parallel_group(), + ) + + if args.pipeline_model_parallel_size > 1: + # opt_stats = opt_stats.clone().detach() + opt_stats = accelerator.FloatTensor(opt_stats) + torch.distributed.all_reduce( + opt_stats, group=mpu.get_pipeline_model_parallel_group() + ) + # opt_stats_2 = opt_stats_2.clone().detach() + opt_stats_2 = accelerator.FloatTensor(opt_stats_2) + torch.distributed.all_reduce( + opt_stats_2, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_pipeline_model_parallel_group(), + ) + wandb_metrics |= { + "optimizer/learning_rate": learning_rate, + "optimizer/iteration": args.iteration, + "optimizer/consumed_train_tokens": args.consumed_train_tokens, + "optimizer/variance_l2": opt_stats[0] ** 0.5, + "optimizer/variance_sqrt_l2": opt_stats[1] ** 0.5, + "optimizer/momentum_l2": opt_stats[2] ** 0.5, + "optimizer/weight_l2": opt_stats[3] ** 0.5, + "optimizer/variance_l1": opt_stats[4], + "optimizer/variance_sqrt_l1": opt_stats[5], + "optimizer/momentum_l1": opt_stats[6], + "optimizer/weight_l1": opt_stats[7], + 
"optimizer/variance_abs_max": opt_stats_2[0], + "optimizer/variance_sqrt_abs_max": opt_stats_2[1], + "optimizer/momentum_abs_max": opt_stats_2[2], + "optimizer/weight_abs_max": opt_stats_2[3], + } + # print('step {} rank {} after sync opt_stats {}, {}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) + if writer and is_last_rank(): + writer.add_scalar( + "optimizer/variance_l2 vs tokens", + opt_stats[0] ** 0.5, + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_sqrt_l2 vs tokens", + opt_stats[1] ** 0.5, + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/momentum_l2 vs tokens", + opt_stats[2] ** 0.5, + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/weight_l2 vs tokens", + opt_stats[3] ** 0.5, + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_l1 vs tokens", + opt_stats[4], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_sqrt_l1 vs tokens", + opt_stats[5], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/momentum_l1 vs tokens", + opt_stats[6], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/weight_l1 vs tokens", + opt_stats[7], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_abs_max vs tokens", + opt_stats_2[0], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_sqrt_abs_max vs tokens", + opt_stats_2[1], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/momentum_abs_max vs tokens", + opt_stats_2[2], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/weight_abs_max vs tokens", + opt_stats_2[3], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_l2", opt_stats[0] ** 0.5, iteration + ) + writer.add_scalar( + "optimizer/variance_sqrt_l2", opt_stats[1] ** 0.5, iteration + ) + writer.add_scalar( + "optimizer/momentum_l2", opt_stats[2] ** 0.5, iteration + ) + writer.add_scalar("optimizer/weight_l2", opt_stats[3] ** 0.5, iteration) + writer.add_scalar("optimizer/variance_l1", opt_stats[4], iteration) + writer.add_scalar("optimizer/variance_sqrt_l1", opt_stats[5], iteration) + writer.add_scalar("optimizer/momentum_l1", opt_stats[6], iteration) + writer.add_scalar("optimizer/weight_l1", opt_stats[7], iteration) + writer.add_scalar( + "optimizer/variance_abs_max", opt_stats_2[0], iteration + ) + writer.add_scalar( + "optimizer/variance_sqrt_abs_max", opt_stats_2[1], iteration + ) + writer.add_scalar( + "optimizer/momentum_abs_max", opt_stats_2[2], iteration + ) + writer.add_scalar("optimizer/weight_abs_max", opt_stats_2[3], iteration) + + assert args is not None + assert timers is not None + if iteration % args.log_interval == 0: + elapsed_time = timers("interval-time").elapsed(barrier=True) + elapsed_time_per_iteration = elapsed_time / total_iterations + seq_len = args.seq_length + if hasattr(args, "actual_seq_length"): + seq_len = args.actual_seq_length + samples_per_sec, tflops, approx_parameters_in_billions = throughput_calculator( + model, args, elapsed_time, total_iterations + ) + samples_per_sec_per_replica = samples_per_sec / args.data_parallel_size + tokens_per_sec = samples_per_sec * seq_len + tokens_per_sec_per_replica = tokens_per_sec / args.data_parallel_size + tokens_per_gpu_per_second = tokens_per_sec / args.world_size + tokens_per_gpu_per_second_per_replica = ( + tokens_per_gpu_per_second / args.data_parallel_size + ) + # NOTE: [2024-06-19] + # Updated to use (more accurate) calculation according 
to + # `num_floating_point_operations` from NVIDIA/Megatron-LM + num_flop_lm = num_floating_point_operations(args, batch_size) + num_flop_per_sec_lm = num_flop_lm / elapsed_time_per_iteration + tflops_lm = num_flop_per_sec_lm / (10**12) + tflops_lm_per_gpu = tflops_lm / args.world_size + wandb_metrics |= { + "throughput/iteration-time": elapsed_time_per_iteration, # 1000 ms / s + "throughput/samples_per_sec": samples_per_sec, + "throughput/samples_per_sec_per_replica": samples_per_sec_per_replica, + "throughput/tokens_per_sec": tokens_per_sec, + "throughput/tokens_per_sec_per_replica": tokens_per_sec_per_replica, + "throughput/tokens_per_gpu_per_sec": tokens_per_gpu_per_second, + "throughput/tokens_per_gpu_per_sec_per_replica": tokens_per_gpu_per_second_per_replica, + "throughput/tflops": tflops, + "throughput/tflops-new": num_flop_lm / elapsed_time_per_iteration, + "throughput/tflops-lm": tflops_lm_per_gpu, + "throughput/approx_params_in_billions": approx_parameters_in_billions, + "throughput/elapsed_ms_per_iteration": elapsed_time_per_iteration, + "throughput/iteration": iteration, + } + if loss_dict is not None: + wandb_metrics |= { + "loss/iteration": iteration, + **{f"loss/{k}": v for k, v in loss_dict.items()}, + } + if writer and args.log_timers_to_tensorboard: + writer.add_scalar( + "iteration-time/iteration-time", elapsed_time_per_iteration, iteration + ) + writer.add_scalar( + "iteration-time/iteration-time vs samples", + elapsed_time_per_iteration, + args.consumed_train_samples, + ) + writer.add_scalar( + "iteration-time/iteration-time vs tokens", + elapsed_time_per_iteration, + args.consumed_train_tokens, + ) + # metrics_to_log = { + # 'iteration': iteration, + # 'train_iters': args.train_iters, + # 'consumed_samples': args.consumed_train_samples, + # 'consumed_tokens': args.consumed_tokens, + # } + log_string = f" iteration={iteration:8d}/{args.train_iters:8d} |" + # .format( iteration, args.train_iters) + log_string += ( + f" consumed_samples={args.consumed_train_samples:12d} |" + # .format(args.consumed_train_samples) + ) + log_string += f" consumed_tokens={args.consumed_train_tokens:12d} |" + # .format( args.consumed_train_tokens) + log_string += ( + " elapsed_time_per_iteration_ms=" + f"{elapsed_time_per_iteration * 1000.0:.1f} |" + # .format( elapsed_time_per_iteration * 1000.0) + ) + log_string += f" learning_rate={learning_rate:.6g} |" + log_string += f" global_batch_size={batch_size:5d} |" + # if wandb is not None and getattr(wandb, 'run', None) is not None: + wandb_metrics |= { + "training/iteration": iteration, + "training/iteration_time": elapsed_time_per_iteration, + "training/iteration_time_vs_tokens": ( + elapsed_time_per_iteration / args.consumed_train_tokens + ), + "training/iteration_time_vs_samples": ( + (elapsed_time_per_iteration / args.consumed_train_samples), + ), + "training/consumed_samples": args.consumed_train_samples, + "training/consumed_tokens": args.consumed_train_tokens, + } + for key in total_loss_dict: + if key not in [advanced_iters_key, skipped_iters_key, nan_iters_key]: + avg = total_loss_dict[key].item() / float( + max(1, total_loss_dict[advanced_iters_key]) + ) + if avg > 0.0: + log_string += " {}={:.6f} |".format(key, avg) + total_loss_dict[key] = accelerator.FloatTensor([0.0]) + if loss_scale is not None: + log_string += " loss_scale={:.1f} |".format(loss_scale) + wandb_metrics |= {"loss/loss_scale": loss_scale} + if grad_norm is not None: + log_string += " grad_norm={:.3f} |".format(grad_norm) + wandb_metrics |= {"loss/grad_norm": 
grad_norm} + if num_zeros_in_grad is not None: + log_string += " num_zeros={:.1f} |".format(num_zeros_in_grad) + wandb_metrics |= {"loss/num_zeros_in_grad": num_zeros_in_grad} + if params_norm is not None: + log_string += " params_norm={:.3f} |".format(params_norm) + wandb_metrics |= {"loss/params_norm": params_norm} + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: + log_string += " curriculum_seqlen={:5d} |".format(args.curriculum_seqlen) + if args.random_ltd: + log_string += " random_ltd reserved_length={:5d} |".format( + args.random_ltd_reserved_length + ) + # log_string += " | ".join([ + # f"{seq_len=:5d} ", + # f"{}" + # f"number_of_skipped_iterations={:3d}", + # + # ]) + log_string += " actual_seqlen={:5d} |".format(seq_len) + log_string += " number_of_skipped_iterations={:3d} |".format( + total_loss_dict[skipped_iters_key] + ) + log_string += " number_of_nan_iterations={:3d} |".format( + total_loss_dict[nan_iters_key] + ) + log_string += " samples_per_second={:.3f} |".format(samples_per_sec) + log_string += " tokens_per_gpu_per_second_tgs={:.3f} |".format( + tokens_per_gpu_per_second + ) + log_string += " [LM]TFLOPs={:.2f} |".format(tflops_lm_per_gpu) + log_string += " [DS]TFLOPs={:.2f} |".format(tflops) + if wandb is not None and getattr(wandb, "run", None) is not None: + wandb_metrics |= { + "training/skiped_iterations": total_loss_dict[skipped_iters_key] + } + wandb_metrics |= {"training/nan_iterations": total_loss_dict[nan_iters_key]} + wandb.log(wandb_metrics) + total_loss_dict[advanced_iters_key] = 0 + total_loss_dict[skipped_iters_key] = 0 + total_loss_dict[nan_iters_key] = 0 + # print_rank_last(log_string) + log.info(log_string) + if report_memory_flag and learning_rate > 0.0: + # Report memory after optimizer state has been initialized. + report_memory("(after {} iterations)".format(iteration)) + report_memory_flag = False + if timers is not None: + timers.log(timers_to_log, normalizer=args.log_interval) + + return report_memory_flag diff --git a/megatron/training_log_alcf.py b/megatron/training_log_alcf.py new file mode 100644 index 0000000000..dcd872971d --- /dev/null +++ b/megatron/training_log_alcf.py @@ -0,0 +1,725 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +"""Pretrain utilities.""" + +from enum import Enum + +# from deepspeed.accelerator import get_accelerator +# from deepspeed.compression.compress import redundancy_clean +import torch +import os +import logging + +from megatron import get_args +from megatron import get_timers +from megatron import get_tensorboard_writer +from megatron import get_wandb_writer +from megatron import get_num_microbatches +from megatron.core import mpu + +# from megatron import is_rank_0, print_rank_0 +# from megatron import print_rank_last +# from megatron.arguments import core_transformer_config_from_args +# from megatron.checkpointing import load_checkpoint +# from megatron.checkpointing import save_checkpoint +# from megatron.core import mpu, tensor_parallel +# from megatron.core.enums import ModelType +# from megatron.core.pipeline_parallel import get_forward_backward_func +# from megatron.data.data_samplers import build_pretraining_data_loader +# from megatron.initialize import initialize_megatron +# from megatron.initialize import write_args_to_tensorboard +# from megatron.initialize import set_jit_fusion_options +# from megatron.model import Float16Module +# from megatron.model import GPTModel +# from megatron.model import DistributedDataParallel as LocalDDP +# from megatron.model.transformer import ParallelTransformerLayer +# from megatron.model.vision.knn_monitor import compute_feature_bank +# from megatron.optimizer import get_megatron_optimizer +# from megatron.optimizer_param_scheduler import OptimizerParamScheduler +# from megatron.profiler import on_step_begin, on_step_end, setup_profiler, trigger +# from megatron.utils import check_adlr_autoresume_termination +# from megatron.utils import found_kill_switch, unwrap_model +import ezpz as ez + +# from megatron.utils import calc_params_l2_norm +from megatron.utils import ( + # checkpoint_throughput_calculator, + report_memory, + throughput_calculator, + # update_rotary_pos_emb, +) + +try: + import wandb +except (ImportError, ModuleNotFoundError): + wandb = None +# The earliest we can measure the start time. +# _TRAIN_START_TIME = time.time() + + +log = logging.getLogger(__name__) + + +class InteropLoggingTool(Enum): + TENSORBOARD = 1 + WANDB = 2 + + +RANK: int = ez.get_rank() +LOCAL_RANK: int = ez.get_local_rank() +WORLD_SIZE: int = ez.get_world_size() +DEVICE_TYPE: str = ez.dist.get_torch_device_type() +DEVICE_ID: str = f"{DEVICE_TYPE}:{LOCAL_RANK}" +DEVICE: torch.device = torch.device(DEVICE_TYPE) + +log: logging.Logger = logging.getLogger(__name__) +LOG_LEVEL: str = str(os.environ.get("LOG_LEVEL", "INFO")).upper() +log.setLevel(LOG_LEVEL) if RANK == 0 else log.setLevel("CRITICAL") + + +def num_floating_point_operations(args, batch_size): + # Group Query Attention. + # if not args.group_query_attention: + if not args.num_key_value_heads: + args.num_key_value_heads = args.num_attention_heads + # args.num_query_groups = args.num_attention_heads + # MoE. 
+ # num_experts_routed_to = 1 if args.num_experts is None else args.moe_router_topk + num_experts_routed_to = 1 if args.num_experts is None else args.topk + gated_linear_multiplier = 3 / 2 if args.swiglu else 1 + return ( + 12 + * batch_size + * args.seq_length + * args.num_layers + * args.hidden_size + * args.hidden_size + * ( + 1 + + ( + (args.ffn_hidden_size / args.hidden_size) + * num_experts_routed_to + * gated_linear_multiplier + ) + + (args.num_key_value_heads / args.num_attention_heads) + + (args.seq_length / args.hidden_size) + + (args.padded_vocab_size / (2 * args.num_layers * args.hidden_size)) + ) + ) + + +def training_log( + loss_dict, + total_loss_dict, + learning_rate, + iteration, + loss_scale, + report_memory_flag, + skipped_iter, + grad_norm, + params_norm, + num_zeros_in_grad, + model=None, + optimizer=None, +): + """Log training information such as losses, timing, ....""" + args = get_args() + timers = get_timers() + writer = get_tensorboard_writer() + assert args is not None and timers is not None + wandb_metrics = {} + # Advanced, skipped, and Nan iterations. + advanced_iters_key = "advanced iterations" + skipped_iters_key = "skipped iterations" + nan_iters_key = "nan iterations" + # Advanced iterations. + if not skipped_iter: + total_loss_dict[advanced_iters_key] = ( + total_loss_dict.get(advanced_iters_key, 0) + 1 + ) + else: + if advanced_iters_key not in total_loss_dict: + total_loss_dict[advanced_iters_key] = 0 + # Skipped iterations. + total_loss_dict[skipped_iters_key] = ( + total_loss_dict.get(skipped_iters_key, 0) + skipped_iter + ) + # Update losses and set nan iterations + got_nan = False + _zero = torch.tensor([0.0]).to(DEVICE) + for key in loss_dict: + if not skipped_iter: + total_loss_dict[key] = total_loss_dict.get(key, _zero) + loss_dict[key] + else: + value = loss_dict[key].float().sum().item() + is_nan = value == float("inf") or value == -float("inf") or value != value + got_nan = got_nan or is_nan + total_loss_dict[nan_iters_key] = total_loss_dict.get(nan_iters_key, 0) + int( + got_nan + ) + + # Logging. + timers_to_log = [ + "forward-backward", + "forward-compute", + "backward-compute", + "batch-generator", + "forward-recv", + "forward-send", + "backward-recv", + "backward-send", + "forward-send-forward-recv", + "forward-send-backward-recv", + "backward-send-forward-recv", + "backward-send-backward-recv", + "forward-backward-send-forward-backward-recv", + "layernorm-grads-all-reduce", + "embedding-grads-all-reduce", + "grads-all-reduce", + "grads-reduce-scatter", + "params-all-gather", + "optimizer-copy-to-main-grad", + "optimizer-unscale-and-check-inf", + "optimizer-clip-main-grad", + "optimizer-count-zeros", + "optimizer-inner-step", + "optimizer-copy-main-to-model-params", + "optimizer", + ] + + # Calculate batch size. + batch_size = ( + args.micro_batch_size * args.data_parallel_size * get_num_microbatches() + ) + total_iterations = ( + total_loss_dict[advanced_iters_key] + total_loss_dict[skipped_iters_key] + ) + + # Tensorboard values. + # Timer requires all the ranks to call. 
+ if args.log_timers_to_tensorboard and ( + iteration % args.tensorboard_log_interval == 0 and writer is not None + ): + timers.write(timers_to_log, writer, iteration, normalizer=total_iterations) + if writer and (iteration % args.tensorboard_log_interval == 0): + writer.add_scalar( + "steps-vs-samples/y=steps,x=samples", iteration, args.consumed_train_samples + ) + writer.add_scalar( + "steps-vs-samples/y=samples,x=steps", args.consumed_train_samples, iteration + ) + writer.add_scalar( + "steps-vs-tokens/y=steps,x=tokens", iteration, args.consumed_train_tokens + ) + writer.add_scalar( + "steps-vs-tokens/y=tokens,x=steps", args.consumed_train_tokens, iteration + ) + if args.log_learning_rate_to_tensorboard: + wandb_metrics |= { + "learning-rate/iteration": iteration, + "learning-rate/learning-rate": learning_rate, + } + writer.add_scalar("learning-rate/learning-rate", learning_rate, iteration) + writer.add_scalar( + "learning-rate/learning-rate vs samples", + learning_rate, + args.consumed_train_samples, + ) + writer.add_scalar( + "learning-rate/learning-rate vs tokens", + learning_rate, + args.consumed_train_tokens, + ) + if args.log_batch_size_to_tensorboard: + writer.add_scalar("batch-size/batch-size", batch_size, iteration) + writer.add_scalar( + "batch-size/batch-size vs samples", + batch_size, + args.consumed_train_samples, + ) + writer.add_scalar( + "batch-size/batch-size vs tokens", + batch_size, + args.consumed_train_tokens, + ) + wandb_metrics |= { + "lm-loss-training/iteration": iteration, + "lm-loss-training/consumed_train_tokens": args.consumed_train_tokens, + } + for key in loss_dict: + wandb_metrics |= {f"lm-loss-training/{key}": loss_dict[key]} + writer.add_scalar(f"lm-loss-training/{key}", loss_dict[key], iteration) + writer.add_scalar( + f"lm-loss-training/{key}" + " vs samples", + loss_dict[key], + args.consumed_train_samples, + ) + writer.add_scalar( + f"lm-loss-training/{key}" + " vs tokens", + loss_dict[key], + args.consumed_train_tokens, + ) + if args.fp16 and loss_scale and args.log_loss_scale_to_tensorboard: + writer.add_scalar("loss-scale/loss-scale", loss_scale, iteration) + writer.add_scalar( + "loss-scale/loss-scale vs samples", + loss_scale, + args.consumed_train_samples, + ) + writer.add_scalar( + "loss-scale/loss-scale vs tokens", + loss_scale, + args.consumed_train_tokens, + ) + if args.log_world_size_to_tensorboard: + writer.add_scalar("world-size/world-size", args.world_size, iteration) + writer.add_scalar( + "world-size/world-size vs samples", + args.world_size, + args.consumed_train_samples, + ) + writer.add_scalar( + "world-size/world-size vs tokens", + args.world_size, + args.consumed_train_tokens, + ) + if grad_norm is not None: + wandb_metrics |= {"training/grad-norm": grad_norm} + writer.add_scalar("grad-norm/grad-norm", grad_norm, iteration) + writer.add_scalar( + "grad-norm/grad-norm vs samples", grad_norm, args.consumed_train_samples + ) + writer.add_scalar( + "grad-norm/grad-norm vs tokens", grad_norm, args.consumed_train_tokens + ) + if num_zeros_in_grad is not None: + wandb_metrics |= {"training/num-zeros": num_zeros_in_grad} + writer.add_scalar("num-zeros/num-zeros", num_zeros_in_grad, iteration) + writer.add_scalar( + "num-zeros/num-zeros vs samples", + num_zeros_in_grad, + args.consumed_train_samples, + ) + writer.add_scalar( + "num-zeros/num-zeros vs tokens", + num_zeros_in_grad, + args.consumed_train_tokens, + ) + if params_norm is not None: + wandb_metrics |= {"training/params-norm": params_norm} + 
writer.add_scalar("params-norm/params-norm", params_norm, iteration) + writer.add_scalar( + "params-norm/params-norm vs samples", + params_norm, + args.consumed_train_samples, + ) + writer.add_scalar( + "params-norm/params-norm vs tokens", + params_norm, + args.consumed_train_tokens, + ) + if hasattr(args, "actual_seq_length"): + writer.add_scalar( + "seqlen/actual_seq_length", args.actual_seq_length, iteration + ) + writer.add_scalar( + "seqlen/actual_seq_length vs samples", + args.actual_seq_length, + args.consumed_train_samples, + ) + writer.add_scalar( + "seqlen/actual_seq_length vs tokens", + args.actual_seq_length, + args.consumed_train_tokens, + ) + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: + writer.add_scalar( + "seqlen/curriculum_seqlen", args.curriculum_seqlen, iteration + ) + writer.add_scalar( + "seqlen/curriculum_seqlen vs samples", + args.curriculum_seqlen, + args.consumed_train_samples, + ) + writer.add_scalar( + "seqlen/curriculum_seqlen vs tokens", + args.curriculum_seqlen, + args.consumed_train_tokens, + ) + if args.random_ltd: + writer.add_scalar( + "seqlen/random_ltd_reserved_length", + args.random_ltd_reserved_length, + iteration, + ) + writer.add_scalar( + "seqlen/random_ltd_reserved_length vs samples", + args.random_ltd_reserved_length, + args.consumed_train_samples, + ) + writer.add_scalar( + "seqlen/random_ltd_reserved_length vs tokens", + args.random_ltd_reserved_length, + args.consumed_train_tokens, + ) + if args.log_memory_to_tensorboard: + mem_stats = torch.cuda.memory_stats() + writer.add_scalar( + "mem-reserved-bytes", + mem_stats["reserved_bytes.all.current"], + iteration, + ) + writer.add_scalar( + "mem-allocated-bytes", + mem_stats["allocated_bytes.all.current"], + iteration, + ) + writer.add_scalar( + "mem-allocated-count", + mem_stats["allocation.all.current"], + iteration, + ) + if iteration % args.tensorboard_log_interval == 0: + # This logging write various optimizer states to tensorboard. This + # feature may consume extra GPU memory thus is set at false by default. 
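+        # The eight opt_stats entries accumulate squared L2 norms and L1
+        # norms of the Adam second moment (exp_avg_sq), its square root,
+        # the momentum (exp_avg), and the weights; opt_stats_2 tracks the
+        # corresponding absolute maxima. Both are reduced across the
+        # ZeRO / tensor / pipeline parallel groups before logging.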
+ if args.log_optimizer_states_to_tensorboard and optimizer is not None: + opt_stats = [0.0] * 8 + opt_stats_2 = [0.0] * 4 + for _, group in enumerate(optimizer.param_groups): + for _, param in enumerate(group["params"]): + state_param = getattr(optimizer, "state", None) + if state_param is not None: + exp_avg_sq = state_param.get("exp_avg_sq", torch.tensor(0.0)) + exp_avg = state_param.get("exp_avg", torch.tensor(0.0)) + opt_stats[0] += (torch.norm(exp_avg_sq).item()) ** 2 + opt_stats[1] += (torch.norm(exp_avg_sq.sqrt()).item()) ** 2 + opt_stats[2] += (torch.norm(exp_avg).item()) ** 2 + opt_stats[3] += (torch.norm(param).item()) ** 2 + opt_stats[4] += torch.norm(exp_avg_sq, p=1).item() + opt_stats[5] += torch.norm(exp_avg_sq.sqrt(), p=1).item() + opt_stats[6] += torch.norm(exp_avg, p=1).item() + opt_stats[7] += torch.norm(param, p=1).item() + opt_stats_2[0] = max( + opt_stats_2[0], + abs(exp_avg_sq.max().item()), + abs(exp_avg_sq.min().item()), + ) + opt_stats_2[1] = max( + opt_stats_2[1], exp_avg_sq.sqrt().abs_().max().item() + ) + opt_stats_2[2] = max( + opt_stats_2[2], + abs(exp_avg.max().item()), + abs(exp_avg.min().item()), + ) + opt_stats_2[3] = max( + opt_stats_2[3], + abs(param.max().item()), + abs(param.min().item()), + ) + if args.zero_stage > 0: + # ZeRO partiions optimizer states + # opt_stats = get_accelerator().FloatTensor(opt_stats) + opt_stats = torch.tensor(opt_stats).to(DEVICE) + torch.distributed.all_reduce( + opt_stats, group=mpu.get_sequence_data_parallel_group() + ) + # opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) + opt_stats_2 = torch.tensor(opt_stats_2).to(DEVICE) + torch.distributed.all_reduce( + opt_stats_2, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_sequence_data_parallel_group(), + ) + + if args.tensor_model_parallel_size > 1: + opt_stats = torch.tensor(opt_stats).to(DEVICE) + # opt_stats = get_accelerator().FloatTensor(opt_stats) + torch.distributed.all_reduce( + opt_stats, group=mpu.get_tensor_model_parallel_group() + ) + # opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) + opt_stats_2 = torch.tensor(opt_stats_2).to(DEVICE) + torch.distributed.all_reduce( + opt_stats_2, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_tensor_model_parallel_group(), + ) + + if args.pipeline_model_parallel_size > 1: + # opt_stats = get_accelerator().FloatTensor(opt_stats) + opt_stats = torch.tensor(opt_stats).to(DEVICE) + torch.distributed.all_reduce( + opt_stats, group=mpu.get_pipeline_model_parallel_group() + ) + # opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) + opt_stats_2 = torch.tensor(opt_stats_2).to(DEVICE) + torch.distributed.all_reduce( + opt_stats_2, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_pipeline_model_parallel_group(), + ) + + wandb_metrics |= { + "optimizer/learning_rate": learning_rate, + "optimizer/iteration": args.iteration, + "optimizer/consumed_train_tokens": args.consumed_train_tokens, + "optimizer/variance_l2": opt_stats[0] ** 0.5, + "optimizer/variance_sqrt_l2": opt_stats[1] ** 0.5, + "optimizer/momentum_l2": opt_stats[2] ** 0.5, + "optimizer/weight_l2": opt_stats[3] ** 0.5, + "optimizer/variance_l1": opt_stats[4], + "optimizer/variance_sqrt_l1": opt_stats[5], + "optimizer/momentum_l1": opt_stats[6], + "optimizer/weight_l1": opt_stats[7], + "optimizer/variance_abs_max": opt_stats_2[0], + "optimizer/variance_sqrt_abs_max": opt_stats_2[1], + "optimizer/momentum_abs_max": opt_stats_2[2], + "optimizer/weight_abs_max": opt_stats_2[3], + } + # print('step {} rank {} after sync opt_stats {}, 
{}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) + # if writer and is_last_rank(): + if writer is not None and RANK == 0: + writer.add_scalar( + "optimizer/variance_l2 vs tokens", + opt_stats[0] ** 0.5, + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_sqrt_l2 vs tokens", + opt_stats[1] ** 0.5, + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/momentum_l2 vs tokens", + opt_stats[2] ** 0.5, + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/weight_l2 vs tokens", + opt_stats[3] ** 0.5, + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_l1 vs tokens", + opt_stats[4], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_sqrt_l1 vs tokens", + opt_stats[5], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/momentum_l1 vs tokens", + opt_stats[6], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/weight_l1 vs tokens", + opt_stats[7], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_abs_max vs tokens", + opt_stats_2[0], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_sqrt_abs_max vs tokens", + opt_stats_2[1], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/momentum_abs_max vs tokens", + opt_stats_2[2], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/weight_abs_max vs tokens", + opt_stats_2[3], + args.consumed_train_tokens, + ) + writer.add_scalar( + "optimizer/variance_l2", opt_stats[0] ** 0.5, iteration + ) + writer.add_scalar( + "optimizer/variance_sqrt_l2", opt_stats[1] ** 0.5, iteration + ) + writer.add_scalar( + "optimizer/momentum_l2", opt_stats[2] ** 0.5, iteration + ) + writer.add_scalar("optimizer/weight_l2", opt_stats[3] ** 0.5, iteration) + writer.add_scalar("optimizer/variance_l1", opt_stats[4], iteration) + writer.add_scalar("optimizer/variance_sqrt_l1", opt_stats[5], iteration) + writer.add_scalar("optimizer/momentum_l1", opt_stats[6], iteration) + writer.add_scalar("optimizer/weight_l1", opt_stats[7], iteration) + writer.add_scalar( + "optimizer/variance_abs_max", opt_stats_2[0], iteration + ) + writer.add_scalar( + "optimizer/variance_sqrt_abs_max", opt_stats_2[1], iteration + ) + writer.add_scalar( + "optimizer/momentum_abs_max", opt_stats_2[2], iteration + ) + writer.add_scalar("optimizer/weight_abs_max", opt_stats_2[3], iteration) + + assert args is not None + assert timers is not None + if iteration % args.log_interval == 0: + elapsed_time = timers("interval-time").elapsed(barrier=True) + elapsed_time_per_iteration = elapsed_time / total_iterations + seq_len = args.seq_length + if hasattr(args, "actual_seq_length"): + seq_len = args.actual_seq_length + samples_per_sec, tflops, approx_parameters_in_billions = throughput_calculator( + model, args, elapsed_time, total_iterations + ) + samples_per_sec_per_replica = samples_per_sec / args.data_parallel_size + tokens_per_sec = samples_per_sec * seq_len + tokens_per_sec_per_replica = tokens_per_sec / args.data_parallel_size + tokens_per_gpu_per_second = tokens_per_sec / args.world_size + tokens_per_gpu_per_second_per_replica = ( + tokens_per_gpu_per_second / args.data_parallel_size + ) + # NOTE: [2024-06-19] + # Updated to use (more accurate) calculation according to + # `num_floating_point_operations` from NVIDIA/Megatron-LM + num_flop_lm = num_floating_point_operations(args, batch_size) + num_flop_per_sec_lm = num_flop_lm / elapsed_time_per_iteration + tflops_lm = 
num_flop_per_sec_lm / (10**12) + tflops_lm_per_gpu = tflops_lm / args.world_size + wandb_metrics |= { + "throughput/iteration-time": elapsed_time_per_iteration, # 1000 ms / s + "throughput/samples_per_sec": samples_per_sec, + "throughput/samples_per_sec_per_replica": samples_per_sec_per_replica, + "throughput/tokens_per_sec": tokens_per_sec, + "throughput/tokens_per_sec_per_replica": tokens_per_sec_per_replica, + "throughput/tokens_per_gpu_per_sec": tokens_per_gpu_per_second, + "throughput/tokens_per_gpu_per_sec_per_replica": tokens_per_gpu_per_second_per_replica, + "throughput/tflops": tflops, + "throughput/tflops-new": num_flop_lm / elapsed_time_per_iteration, + "throughput/tflops-lm": tflops_lm_per_gpu, + "throughput/approx_params_in_billions": approx_parameters_in_billions, + "throughput/elapsed_ms_per_iteration": elapsed_time_per_iteration, + "throughput/iteration": iteration, + } + if loss_dict is not None: + wandb_metrics |= { + "loss/iteration": iteration, + **{f"loss/{k}": v for k, v in loss_dict.items()}, + } + if writer and args.log_timers_to_tensorboard: + writer.add_scalar( + "iteration-time/iteration-time", elapsed_time_per_iteration, iteration + ) + writer.add_scalar( + "iteration-time/iteration-time vs samples", + elapsed_time_per_iteration, + args.consumed_train_samples, + ) + writer.add_scalar( + "iteration-time/iteration-time vs tokens", + elapsed_time_per_iteration, + args.consumed_train_tokens, + ) + # metrics_to_log = { + # 'iteration': iteration, + # 'train_iters': args.train_iters, + # 'consumed_samples': args.consumed_train_samples, + # 'consumed_tokens': args.consumed_tokens, + # } + log_string = f" iteration={iteration:8d}/{args.train_iters:8d} |" + # .format( iteration, args.train_iters) + log_string += ( + f" consumed_samples={args.consumed_train_samples:12d} |" + # .format(args.consumed_train_samples) + ) + log_string += f" consumed_tokens={args.consumed_train_tokens:12d} |" + # .format( args.consumed_train_tokens) + log_string += ( + " elapsed_time_per_iteration_ms=" + f"{elapsed_time_per_iteration * 1000.0:.1f} |" + # .format( elapsed_time_per_iteration * 1000.0) + ) + log_string += f" learning_rate={learning_rate:.6g} |" + log_string += f" global_batch_size={batch_size:5d} |" + # if wandb is not None and getattr(wandb, 'run', None) is not None: + wandb_metrics |= { + "training/iteration": iteration, + "training/iteration_time": elapsed_time_per_iteration, + "training/iteration_time_vs_tokens": ( + elapsed_time_per_iteration / args.consumed_train_tokens + ), + "training/iteration_time_vs_samples": ( + (elapsed_time_per_iteration / args.consumed_train_samples), + ), + "training/consumed_samples": args.consumed_train_samples, + "training/consumed_tokens": args.consumed_train_tokens, + } + for key in total_loss_dict: + if key not in [advanced_iters_key, skipped_iters_key, nan_iters_key]: + avg = total_loss_dict[key].item() / float( + max(1, total_loss_dict[advanced_iters_key]) + ) + if avg > 0.0: + log_string += " {}={:.6f} |".format(key, avg) + total_loss_dict[key] = torch.tensor([0.0]).to(DEVICE) + if loss_scale is not None: + log_string += " loss_scale={:.1f} |".format(loss_scale) + wandb_metrics |= {"loss/loss_scale": loss_scale} + if grad_norm is not None: + log_string += " grad_norm={:.3f} |".format(grad_norm) + wandb_metrics |= {"loss/grad_norm": grad_norm} + if num_zeros_in_grad is not None: + log_string += " num_zeros={:.1f} |".format(num_zeros_in_grad) + wandb_metrics |= {"loss/num_zeros_in_grad": num_zeros_in_grad} + if params_norm is not None: + 
log_string += " params_norm={:.3f} |".format(params_norm) + wandb_metrics |= {"loss/params_norm": params_norm} + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: + log_string += " curriculum_seqlen={:5d} |".format(args.curriculum_seqlen) + if args.random_ltd: + log_string += " random_ltd reserved_length={:5d} |".format( + args.random_ltd_reserved_length + ) + # log_string += " | ".join([ + # f"{seq_len=:5d} ", + # f"{}" + # f"number_of_skipped_iterations={:3d}", + # + # ]) + log_string += " actual_seqlen={:5d} |".format(seq_len) + log_string += " number_of_skipped_iterations={:3d} |".format( + total_loss_dict[skipped_iters_key] + ) + log_string += " number_of_nan_iterations={:3d} |".format( + total_loss_dict[nan_iters_key] + ) + log_string += " samples_per_second={:.3f} |".format(samples_per_sec) + log_string += " tokens_per_gpu_per_second_tgs={:.3f} |".format( + tokens_per_gpu_per_second + ) + log_string += " [LM]TFLOPs={:.2f} |".format(tflops_lm_per_gpu) + log_string += " [DS]TFLOPs={:.2f} |".format(tflops) + total_loss_dict[advanced_iters_key] = 0 + total_loss_dict[skipped_iters_key] = 0 + total_loss_dict[nan_iters_key] = 0 + # print_rank_last(log_string) + log.info(log_string) + if report_memory_flag and learning_rate > 0.0: + # Report memory after optimizer state has been initialized. + report_memory("(after {} iterations)".format(iteration)) + report_memory_flag = False + if wandb is not None and getattr(wandb, "run", None) is not None: + wandb_metrics |= { + "training/skiped_iterations": total_loss_dict[skipped_iters_key] + } + wandb_metrics |= {"training/nan_iterations": total_loss_dict[nan_iters_key]} + wandb.log(wandb_metrics) + if timers is not None: + timers.log(timers_to_log, normalizer=args.log_interval) + + return report_memory_flag diff --git a/megatron/utils.py b/megatron/utils.py index 15160b1644..dc1dea0b3a 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -1,34 +1,158 @@ -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
"""General utilities.""" import sys import os +import logging +from typing import Optional import torch from torch.nn.parallel import DistributedDataParallel as torchDDP from deepspeed.accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': - from apex.multi_tensor_apply import multi_tensor_applier - import amp_C - -from megatron import ( - get_args, - get_adlr_autoresume, - get_num_microbatches -) + +from megatron import get_args, get_adlr_autoresume, get_num_microbatches from megatron.core import mpu from megatron.core.tensor_parallel import param_is_not_tensor_parallel_duplicate from megatron.model.module import param_is_not_shared from megatron.model.rotary_pos_embedding import RotaryEmbedding +import ezpz as ez + +ACCELERATOR = get_accelerator() +assert ACCELERATOR is not None + +if ACCELERATOR.device_name() == "cuda": + try: + from apex.multi_tensor_apply import multi_tensor_applier # type:ignore + import amp_C # type:ignore + + HAS_APEX = True + except Exception: + HAS_APEX = False + +RANK = ez.get_rank() +log = logging.getLogger(__name__) +log.setLevel(os.environ.get("LOG_LEVEL", ("INFO" if RANK == 0 else "CRITICAL"))) + +_DLIO_PROFILER_EXIST = True +_DFTRACER_EXIST = True + +try: + import dftracer # type:ignore +except Exception: + _DFTRACER_EXIST = False + +try: + import dlio_profiler # type:ignore +except Exception: + _DLIO_PROFILER_EXIST = False + + +if _DFTRACER_EXIST: + from dftracer.logger import ( # type:ignore + dftracer as PerfTrace, + dft_fn as Profile, + DFTRACER_ENABLE as DFTRACER_ENABLE, + ) +elif _DLIO_PROFILER_EXIST: + from dlio_profiler.logger import fn_interceptor as Profile # type:ignore + from dlio_profiler.logger import dlio_logger as PerfTrace # type:ignore +else: + from functools import wraps + + class Profile(object): + def __init__( + self, cat, name=None, epoch=None, step=None, image_idx=None, image_size=None + ): + return + + def log(self, func): + return func + + def log_init(self, func): + return func + + def iter(self, func, iter_name="step"): + return func + + def __enter__(self): + return + + def __exit__(self, type, value, traceback): + return + + def update( + self, epoch=None, step=None, image_idx=None, image_size=None, args={} + ): + return + + def flush(self): + return + + def reset(self): + return + + def log_static(self, func): + return + + class dftracer(object): + def __init__( + self, + ): + self.type = None + + def initialize_log(self, logfile=None, data_dir=None, process_id=-1): + return + + def get_time(self): + return + + def enter_event(self): + return + + def exit_event(self): + return + + def log_event(self, name, cat, start_time, duration, string_args=None): + return + + def finalize(self): + return + + PerfTrace = dftracer() + DFTRACER_ENABLE = False + + +def get_logger( + name: str, + level: Optional[str] = None, + rank_zero_only: Optional[bool] = True, +) -> logging.Logger: + """Returns a `logging.Logger` object. + + If `rank_zero_only` passed, the level will be set to CRITICAL on all + non-zero ranks (and will be set to `level` on RANK==0). 
+ """ + logger = logging.getLogger(name) + logger.setLevel( + str(level if level is not None else os.environ.get("LOG_LEVEL", "INFO")).upper() + ) + if rank_zero_only and ez.get_rank() != 0: + logger.setLevel("CRITICAL") + return logger + def update_rotary_pos_emb(seq_length): args = get_args() - rotary_dim = args.hidden_size // args.num_attention_heads \ - if args.kv_channels is None else args.kv_channels + accelerator = get_accelerator() + assert args is not None and accelerator is not None + rotary_dim = ( + args.hidden_size // args.num_attention_heads + if args.kv_channels is None + else args.kv_channels + ) if args.rotary_percent < 1.0: rotary_dim = int(rotary_dim * args.rotary_percent) @@ -37,7 +161,8 @@ def update_rotary_pos_emb(seq_length): # Wang and Komatsuzaki et al # https://github.com/kingoflolz/mesh-transformer-jax/ rotary_pos_emb = RotaryEmbedding(rotary_dim, theta=args.rope_theta)(seq_length).to( - get_accelerator().current_device_name()) + accelerator.current_device_name() + ) args.rotary_pos_emb = rotary_pos_emb @@ -57,8 +182,9 @@ def unwrap_model(model, module_instances=(torchDDP)): def calc_params_l2_norm(model): - """Calculate l2 norm of parameters """ + """Calculate l2 norm of parameters""" args = get_args() + assert args is not None if not isinstance(model, list): model = [model] # Remove duplicate params. @@ -74,82 +200,84 @@ def calc_params_l2_norm(model): params_data.append(param.data) # Calculate norm dummy_overflow_buf = get_accelerator().IntTensor([0]) - - if get_accelerator().device_name() == 'cuda': + if get_accelerator().device_name() == "cuda" and HAS_APEX: norm, _ = multi_tensor_applier( amp_C.multi_tensor_l2norm, dummy_overflow_buf, [params_data], - False # no per-parameter norm + False, # no per-parameter norm ) - else : - norm = torch.norm(params_data,p=2.0) + else: + norm = torch.norm(params_data, p=2.0) norm_2 = norm * norm # Sum across all model-parallel GPUs. 
- torch.distributed.all_reduce(norm_2, - op=torch.distributed.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + torch.distributed.all_reduce( + norm_2, op=torch.distributed.ReduceOp.SUM, group=mpu.get_model_parallel_group() + ) return norm_2.item() ** 0.5 def average_losses_across_data_parallel_group(losses): """Reduce a tensor of losses across all GPUs.""" - averaged_losses = torch.cat( - [loss.clone().detach().view(1) for loss in losses]) - torch.distributed.all_reduce(averaged_losses, - group=mpu.get_data_parallel_group()) - averaged_losses = averaged_losses / \ - torch.distributed.get_world_size(group=mpu.get_data_parallel_group()) + averaged_losses = torch.cat([loss.clone().detach().view(1) for loss in losses]) + torch.distributed.all_reduce(averaged_losses, group=mpu.get_data_parallel_group()) + averaged_losses = averaged_losses / torch.distributed.get_world_size( + group=mpu.get_data_parallel_group() + ) return averaged_losses def report_memory(name): """Simple GPU memory report.""" + accelerator = get_accelerator() + assert accelerator is not None mega_bytes = 1024.0 * 1024.0 - string = name + ' memory (MB)' - string += ' | allocated: {}'.format( - get_accelerator().memory_allocated() / mega_bytes) - string += ' | max allocated: {}'.format( - get_accelerator().max_memory_allocated() / mega_bytes) - string += ' | reserved: {}'.format( - get_accelerator().memory_reserved() / mega_bytes) - string += ' | max reserved: {}'.format( - get_accelerator().max_memory_reserved() / mega_bytes) + string = name + " memory (MB)" + string += " | allocated: {}".format(accelerator.memory_allocated() / mega_bytes) + string += " | max allocated: {}".format( + accelerator.max_memory_allocated() / mega_bytes + ) + reserved = accelerator.memory_reserved() + max_reserved = accelerator.max_memory_reserved() + if reserved is not None: + string += " | reserved: {}".format(reserved / mega_bytes) + if max_reserved is not None: + string += " | max reserved: {}".format(max_reserved / mega_bytes) if mpu.get_data_parallel_rank() == 0: - print("[Rank {}] {}".format(torch.distributed.get_rank(), string), - flush=True) + log.info(f"[Rank {RANK}] {string}") def print_params_min_max_norm(optimizer, iteration): """Print min, max, and norm of all parameters.""" index = 0 rank = torch.distributed.get_rank() - string = 'iteration, rank, index, tensor-model-parallel, min, max, norm\n' + string = "iteration, rank, index, tensor-model-parallel, min, max, norm\n" optimizer_ = optimizer.optimizer for param_group in optimizer_.param_groups: - for param in param_group['params']: + for param in param_group["params"]: index += 1 min_ = param.data.min() max_ = param.data.max() norm = torch.linalg.norm(param.data) - string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format( - iteration, rank, index, int(param.tensor_model_parallel)) - string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm) - print(string, flush=True) + string += "{:7d}, {:4d}, {:4d}, {:2d}, ".format( + iteration, rank, index, int(param.tensor_model_parallel) + ) + string += "{:.6E}, {:.6E}, {:.6E}\n".format(min_, max_, norm) + log.info(string) -def check_adlr_autoresume_termination(iteration, model, - optimizer, opt_param_scheduler): +def check_adlr_autoresume_termination(iteration, model, optimizer, opt_param_scheduler): """Check for autoresume signal and exit if it is received.""" from megatron.checkpointing import save_checkpoint args = get_args() + assert args is not None autoresume = get_adlr_autoresume() # Add barrier to ensure consistnecy. 
torch.distributed.barrier() - if autoresume.termination_requested(): + if autoresume is not None and autoresume.termination_requested(): if args.save: save_checkpoint(iteration, model, optimizer, opt_param_scheduler) print_rank_0(">>> autoresume termination request found!") @@ -159,12 +287,14 @@ def check_adlr_autoresume_termination(iteration, model, sys.exit(0) -def get_ltor_masks_and_position_ids(data, - eod_token, - reset_position_ids, - reset_attention_mask, - eod_mask_loss, - skip_mask=False): +def get_ltor_masks_and_position_ids( + data, + eod_token, + reset_position_ids, + reset_attention_mask, + eod_mask_loss, + skip_mask=False, +): """Build masks and position id for left to right model.""" # Extract batch size and sequence length. @@ -178,8 +308,9 @@ def get_ltor_masks_and_position_ids(data, attention_mask = None if not skip_mask: - attention_mask = torch.tril(torch.ones( - (att_mask_batch, seq_length, seq_length), device=data.device)).view(att_mask_batch, 1, seq_length, seq_length) + attention_mask = torch.tril( + torch.ones((att_mask_batch, seq_length, seq_length), device=data.device) + ).view(att_mask_batch, 1, seq_length, seq_length) # Loss mask. loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) @@ -187,8 +318,7 @@ def get_ltor_masks_and_position_ids(data, loss_mask[data == eod_token] = 0.0 # Position ids. - position_ids = torch.arange(seq_length, dtype=torch.long, - device=data.device) + position_ids = torch.arange(seq_length, dtype=torch.long, device=data.device) position_ids = position_ids.unsqueeze(0).expand_as(data) # We need to clone as the ids will be modifed based on batch index. if reset_position_ids: @@ -197,7 +327,6 @@ def get_ltor_masks_and_position_ids(data, if reset_position_ids or reset_attention_mask: # Loop through the batches: for b in range(micro_batch_size): - # Find indecies where EOD token is. eod_index = position_ids[b, data[b] == eod_token] # Detach indecies from positions if going to modify positions. @@ -209,71 +338,131 @@ def get_ltor_masks_and_position_ids(data, for j in range(eod_index.size()[0]): i = eod_index[j] # Mask attention loss. - if reset_attention_mask and not skip_mask: - attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 + if ( + reset_attention_mask + and not skip_mask + and attention_mask is not None + ): + attention_mask[b, 0, (i + 1) :, : (i + 1)] = 0 # Reset positions. 
if reset_position_ids: - position_ids[b, (i + 1):] -= (i + 1 - prev_index) + position_ids[b, (i + 1) :] -= i + 1 - prev_index prev_index = i + 1 # Convert attention mask to binary: if not skip_mask: - attention_mask = (attention_mask < 0.5) + assert attention_mask is not None + attention_mask = attention_mask < 0.5 return attention_mask, loss_mask, position_ids def print_rank_0(message): """If distributed is initialized, print only on rank 0.""" - if torch.distributed.is_initialized(): - if torch.distributed.get_rank() == 0: - print(message, flush=True) - else: - print(message, flush=True) + # if torch.distributed.is_initialized(): + # if torch.distributed.get_rank() == 0: + # # print(message, flush=True) + # print(message, flush=True) + # else: + # print(message, flush=True) + _ = log.info(f"{message}") if RANK == 0 else None + def is_last_rank(): - return torch.distributed.get_rank() == ( - torch.distributed.get_world_size() - 1) + return torch.distributed.get_rank() == (torch.distributed.get_world_size() - 1) + def print_rank_last(message): """If distributed is initialized, print only on last rank.""" if torch.distributed.is_initialized(): if is_last_rank(): - print(message, flush=True) + # print(message, flush=True) + log.info(message) else: - print(message, flush=True) + log.info(message) + def is_aml(): # Are we running inside an Azure Machine Learning (AML) environment? - return 'AZUREML_EXPERIMENT_ID' in os.environ + return "AZUREML_EXPERIMENT_ID" in os.environ + def is_rank_0(): """Check whether it is rank 0. For AML, check if it is rank 0 of a node""" if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0 or ( - is_aml() and torch.distributed.get_rank() % get_accelerator().device_count() == 0 - ): + is_aml() + and (torch.distributed.get_rank() % get_accelerator().device_count()) == 0 + ): return True else: return False else: return True -def get_parameters_in_billions(model): - gpus_per_model = torch.distributed.get_world_size(group=mpu.get_model_parallel_group()) - approx_parameters_in_billions = sum([sum([p.ds_numel if hasattr(p,'ds_id') else p.nelement() for p in model_module.parameters()]) - for model_module in model]) +def get_parameters_in_billions(model): + gpus_per_model = torch.distributed.get_world_size( + group=mpu.get_model_parallel_group() + ) + + approx_parameters_in_billions = sum( + [ + sum( + [ + p.ds_numel if hasattr(p, "ds_id") else p.nelement() + for p in model_module.parameters() + ] + ) + for model_module in model + ] + ) + + return approx_parameters_in_billions * gpus_per_model / (1e9) + + +def num_floating_point_operations(args, batch_size): + # Group Query Attention. + # if not args.group_query_attention: + if not args.num_key_value_heads: + args.num_key_value_heads = args.num_attention_heads + # args.num_query_groups = args.num_attention_heads + # MoE. 
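+    # The leading 12 * batch * seq * layers * hidden^2 term (coefficient 1)
+    # counts the query and attention-output projections over forward +
+    # backward; the remaining terms in the parentheses add the FFN GEMMs
+    # (scaled by the number of routed experts and the 3/2 SwiGLU factor),
+    # the grouped-query K/V projections, the attention score/context
+    # batched matmuls (seq_length / hidden_size), and the LM-head logits
+    # GEMM (padded_vocab_size / (2 * num_layers * hidden_size)).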
+ # num_experts_routed_to = 1 if args.num_experts is None else args.moe_router_topk + num_experts_routed_to = 1 if args.num_experts is None else args.topk + gated_linear_multiplier = 3 / 2 if args.swiglu else 1 + return ( + 12 + * batch_size + * args.seq_length + * args.num_layers + * args.hidden_size + * args.hidden_size + * ( + 1 + + ( + (args.ffn_hidden_size / args.hidden_size) + * num_experts_routed_to + * gated_linear_multiplier + ) + + (args.num_key_value_heads / args.num_attention_heads) + + (args.seq_length / args.hidden_size) + + (args.padded_vocab_size / (2 * args.num_layers * args.hidden_size)) + ) + ) - return approx_parameters_in_billions*gpus_per_model/(1e9) def throughput_calculator(model, args, iteration_time, total_iterations): - batch_size = args.micro_batch_size * get_num_microbatches() * args.data_parallel_size - approx_parameters_in_billions = None if (model is None) else get_parameters_in_billions(model) - elapsed_time_per_iter = iteration_time/total_iterations + batch_size = ( + args.micro_batch_size * get_num_microbatches() * args.data_parallel_size + ) + approx_parameters_in_billions = ( + None if (model is None) else get_parameters_in_billions(model) + ) + elapsed_time_per_iter = iteration_time / total_iterations samples_per_second = batch_size / elapsed_time_per_iter - #flops calculator + # flops calculator hidden_size = args.hidden_size num_attention_heads = args.num_attention_heads head_dim = hidden_size // num_attention_heads @@ -281,6 +470,7 @@ def throughput_calculator(model, args, iteration_time, total_iterations): num_layers = args.num_layers vocab_size = args.padded_vocab_size gqa = args.num_attention_heads // args.num_key_value_heads + num_experts_routed_to = args.topk ffn_multiplier = 3 if args.swiglu else 2 macs_per_flops = 2 @@ -291,58 +481,91 @@ def throughput_calculator(model, args, iteration_time, total_iterations): seq_len = args.seq_length if hasattr(args, 'actual_seq_length'): seq_len = args.actual_seq_length - - pre_and_post_mha_gemm_macs = batch_size * num_layers * (1 + (2 // gqa) + 1) * (hidden_size**2) * seq_len - mha_bgemm_macs = batch_size * num_layers * 2 * head_dim * num_attention_heads * (seq_len**2) - ffn_gemm_macs = batch_size * num_layers * ffn_multiplier * ffn_hidden_size * hidden_size * seq_len + pre_and_post_mha_gemm_macs = ( + batch_size * num_layers * (1 + (2 // gqa) + 1) * (hidden_size**2) * seq_len + ) + mha_bgemm_macs = ( + batch_size * num_layers * 2 * head_dim * num_attention_heads * (seq_len**2) + ) + ffn_gemm_macs = ( + batch_size + * num_layers + * ffn_multiplier + * ffn_hidden_size + * hidden_size + * seq_len + * num_experts_routed_to + ) logit_lmhead_gemm_macs = batch_size * vocab_size * hidden_size * seq_len - fwd_macs = pre_and_post_mha_gemm_macs + mha_bgemm_macs + ffn_gemm_macs + logit_lmhead_gemm_macs + fwd_macs = ( + pre_and_post_mha_gemm_macs + + mha_bgemm_macs + + ffn_gemm_macs + + logit_lmhead_gemm_macs + ) bwd_macs = 2 * fwd_macs fwd_bwd_macs = fwd_macs + bwd_macs - if (hasattr(args, 'checkpoint_activations') and args.checkpoint_activations) or (hasattr(args, 'recompute_granularity') and args.recompute_granularity == 'full'): + if (hasattr(args, "checkpoint_activations") and args.checkpoint_activations) or ( + hasattr(args, "recompute_granularity") and args.recompute_granularity == "full" + ): fwd_bwd_macs += fwd_macs - if hasattr(args, 'recompute_granularity') and args.recompute_granularity == 'selective': + if ( + hasattr(args, "recompute_granularity") + and args.recompute_granularity == "selective" + ): 
fwd_bwd_macs += mha_bgemm_macs flops_per_iteration = fwd_bwd_macs * macs_per_flops tflops = flops_per_iteration / (elapsed_time_per_iter * args.world_size * (10**12)) return samples_per_second, tflops, approx_parameters_in_billions + def checkpoint_throughput_calculator(model, latency_second): approx_parameters_in_billions = get_parameters_in_billions(model) - checkpoint_multiplier = 14 # fp16 weights (2), fp32 weights (4), fp32 momentum (4), fp32 variance (4) + checkpoint_multiplier = ( + 14 # fp16 weights (2), fp32 weights (4), fp32 momentum (4), fp32 variance (4) + ) checkpoint_GB = approx_parameters_in_billions * checkpoint_multiplier GB_per_second = checkpoint_GB / latency_second - print_rank_0(f"Checkpoint Save GB: {round(checkpoint_GB, 3)}, GB/Sec: {round(GB_per_second,2)}, Latency(second): {round(latency_second, 3)}") + print_rank_0( + f"Checkpoint Save GB: {round(checkpoint_GB, 3)}, GB/Sec: {round(GB_per_second,2)}, Latency(second): {round(latency_second, 3)}" + ) def get_fingerprint_header(): return f"{'min':^13} {'max':^13} {'mean':^13} {'l2 norm':^12} metadata" + def get_fingerprint(p): return f"{p.min():13.6e} {p.max():13.6e} {p.mean():13.6e} {p.norm():12.6e}" def dump_position_embed_weights(preamble, iteration, model): - # return + # return from deepspeed.utils import safe_get_full_fp32_param + tp_rank = mpu.get_tensor_model_parallel_rank() pp_rank = mpu.get_pipeline_model_parallel_rank() dp_rank = mpu.get_data_parallel_rank() get_fingerprint_header() for n, p in model[0].named_parameters(): - if 'position_embeddings' in n: + if "position_embeddings" in n: tag = "pos_embed" elif "word_embeddings" in n: tag = "word_embed" else: - continue - print(f"iter {iteration} {preamble} {tag} lp {tp_rank}/{pp_rank}/{dp_rank}: {get_fingerprint(p)} {p.shape}\n") + continue + log.info( + f"iter {iteration} {preamble} {tag} lp {tp_rank}/{pp_rank}/{dp_rank}: {get_fingerprint(p)} {p.shape}\n" + ) fp32_value = safe_get_full_fp32_param(p) - if fp32_value is not None: - print(f"iter {iteration} {preamble} {tag} hp {tp_rank}/{pp_rank}/{dp_rank}: {get_fingerprint(fp32_value)} {p.shape}\n") + if fp32_value is not None: + log.info( + f"iter {iteration} {preamble} {tag} hp {tp_rank}/{pp_rank}/{dp_rank}: {get_fingerprint(fp32_value)} {p.shape}\n" + ) + def dump_weights(preamble, iteration, model, optimizer, tensor=None): # return @@ -353,19 +576,19 @@ def dump_weights(preamble, iteration, model, optimizer, tensor=None): fn = f"debug-bf16-{iteration}-pp{pp_rank}-tp{tp_rank}-dp{dp_rank}-{preamble}.txt" # only care for first and last pp stages and dp0 tp0 - #if not (mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()): + # if not (mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()): # return - #if not (tp_rank == 0 and dp_rank == 0): + # if not (tp_rank == 0 and dp_rank == 0): # return if tensor is not None: orig_tensor = tensor if hasattr(tensor, "_hp_param"): - numel = tensor._hp_param.numel() # // dp_size + numel = tensor._hp_param.numel() # // dp_size tensor = tensor.flatten().narrow(0, 0, numel) - #print(fn) + # print(fn) with open(fn, "w") as fh: fh.write(f"{get_fingerprint_header()}\n") @@ -375,33 +598,30 @@ def dump_weights(preamble, iteration, model, optimizer, tensor=None): for n, p in model[0].named_parameters(): fh.write(f"{get_fingerprint(p)} {n} {p.shape}\n") - + # # until we figure out how to dump the actual fp32 values don't do this + # fn = f"debug-fp32-{iteration}-pp{pp_rank}-tp{tp_rank}-dp{dp_rank}-{preamble}.txt" + # with open(fn, "w") as fh: + # 
fh.write(f"{get_fingerprint_header()}\n") + # if tensor is not None: + # tensor = orig_tensor + # if hasattr(tensor, "_hp_param"): + # fh.write(f"{get_fingerprint(tensor._hp_param)} tensor {tensor._hp_param.shape}\n") + # #fh.write(f"{get_fingerprint(tensor._hp_grad)} tensor grad\n") + # else: + # fh.write(f"{get_fingerprint(tensor)} tensor {tensor.shape}\n") + # #fh.write(f"{get_fingerprint(tensor.grad)} tensor grad\n") + # + # else: + # if hasattr(model[0].module.tied_modules, "embed"): + # p = model[0].module.tied_modules.embed.word_embeddings.weight._hp_param + # fh.write(f"{get_fingerprint(p)} module.tied_modules.embed.word_embeddings.weight._hp_param {p.shape}\n") return - # until we figure out how to dump the actual fp32 values don't do this - fn = f"debug-fp32-{iteration}-pp{pp_rank}-tp{tp_rank}-dp{dp_rank}-{preamble}.txt" - with open(fn, "w") as fh: - fh.write(f"{get_fingerprint_header()}\n") - if tensor is not None: - tensor = orig_tensor - if hasattr(tensor, "_hp_param"): - fh.write(f"{get_fingerprint(tensor._hp_param)} tensor {tensor._hp_param.shape}\n") - #fh.write(f"{get_fingerprint(tensor._hp_grad)} tensor grad\n") - else: - fh.write(f"{get_fingerprint(tensor)} tensor {tensor.shape}\n") - #fh.write(f"{get_fingerprint(tensor.grad)} tensor grad\n") - - else: - if hasattr(model[0].module.tied_modules, "embed"): - p = model[0].module.tied_modules.embed.word_embeddings.weight._hp_param - fh.write(f"{get_fingerprint(p)} module.tied_modules.embed.word_embeddings.weight._hp_param {p.shape}\n") - - def found_kill_switch(): args = get_args() + assert args is not None if args.kill_switch_file is not None and os.path.exists(args.kill_switch_file): return True else: return False - diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py new file mode 100644 index 0000000000..3686c6ceeb --- /dev/null +++ b/pretrain_gpt_alcf.py @@ -0,0 +1,631 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +"""Pretrain GPT""" + +import time +from typing import Callable +from mpi4py import MPI + +comm = MPI.COMM_WORLD +comm.Barrier() +python_start_time = time.time() + +import os +from rich import print +import torch +import math +from functools import partial +from megatron import get_args + +# from megatron import print_rank_0 +from megatron import get_timers +from megatron import get_tokenizer +from megatron.core import mpu, tensor_parallel +from megatron.core.enums import ModelType +from megatron.data.gpt_dataset import build_train_valid_test_datasets +from megatron.model import GPTModel, GPTModelPipe +from megatron.training import pretrain +from megatron.utils import get_ltor_masks_and_position_ids +from megatron.utils import ( + average_losses_across_data_parallel_group, + update_rotary_pos_emb, +) +from megatron.arguments import core_transformer_config_from_args + +# from megatron.utils import Profile, PerfTrace + +import logging + +import deepspeed +from deepspeed.runtime.utils import see_memory_usage + +# from deepspeed.accelerator.real_accelerator import get_accelerator +import subprocess +import wandb + +from torch import nn +import torch.nn.functional as F +import ezpz as ez + +dt_imports = time.time() - python_start_time +t0_setup = time.time() + +# ---- [SETUP COMMS] ------------------------ +# if str(os.environ.get('LAUNCH_CMD', 'mpich')).lower() == 'mpich': +RANK = ez.setup_torch(backend="deepspeed") # , timeout=7200) +dt_setup = time.time() - t0_setup +# else: +# RANK = ez.get_rank() +WORLD_SIZE = ez.get_world_size() +LOCAL_RANK = ez.get_local_rank() +DEVICE_TYPE = ez.dist.get_torch_device_type() +if torch.cuda.is_available(): + torch.cuda.set_device(LOCAL_RANK) + +log = logging.getLogger(__name__) +LOG_LEVEL = str(os.environ.get("LOG_LEVEL", "INFO")).upper() +# set logging level to "INFO" on RANK == 0, "CRITICAL" on all other ranks +log.setLevel(LOG_LEVEL) if RANK == 0 else log.setLevel("CRITICAL") + +log.info(f"Import python modules in {dt_imports} seconds") +log.info(f"ez.setup_torch time: {dt_setup} seconds") + +# ---- [SETUP WANDB FROM RANK 0] -------------- +WANDB_MODE = os.environ.get("WANDB_MODE", None) +DISABLE_WANDB = WANDB_MODE is not None and str(WANDB_MODE).lower() == "disabled" +if RANK == 0 and not DISABLE_WANDB: + project_name = os.environ.get( + "WB_PROJECT", # look for WB_PROJECT in env + os.environ.get("WANDB_PROJECT", "AuroraGPT"), # look for WANDB_PROJECT in env + ) + log.info(f"Setting up W&B from: {RANK} with {project_name}") + _ = ez.setup_wandb(project_name=project_name) + + +@ez.dist.timeitlogit(rank=RANK) +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + log.info("building GPT model ...") + see_memory_usage("Before Building Model", force=True) + args = get_args() + assert args is not None + config = core_transformer_config_from_args(args) + # if RANK == 0: + # git_ds_info() + if hasattr(mpu, "get_sequence_data_parallel_group"): + dpg = mpu.get_sequence_data_parallel_group() + elif hasattr(mpu, "get_data_parallel_group"): + dpg = mpu.get_data_parallel_group() + else: + dpg = None + deepspeed_zero_init = deepspeed.zero.Init + if args.use_mics: + deepspeed_zero_init = deepspeed.zero.MiCS_Init + with deepspeed_zero_init( + data_parallel_group=dpg, + remote_device=(None if args.remote_device == "none" else args.remote_device), + config_dict_or_path=args.deepspeed_config, # _dict, + enabled=args.zero_stage == 3, + mpu=mpu, + ): + if args.deepspeed and not args.no_pipeline_parallel: + model = GPTModelPipe(config=config, 
num_tokentypes=0, parallel_output=True) + # This is a hack to give us a reference to + # get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + # Precompute the attention mask and store it in args. + # This avoids having to pipeline it + # as an activation during training. + # The mask is constant, and thus we can reuse it. + attention_mask = torch.tril( + torch.ones( + (1, args.seq_length, args.seq_length), + device=DEVICE_TYPE, + ) + ).view(1, 1, args.seq_length, args.seq_length) + # Convert attention mask to binary: + attention_mask = attention_mask < 0.5 + if args.fp16: + attention_mask = attention_mask.half() + elif args.bf16: + attention_mask = attention_mask.bfloat16() + # Attention mask must be bool. + args.attn_mask = attention_mask.to(torch.bool) + # For pretraining, since sequence length is fixed, + # cache rotary embedding in args, to avoid communicating around + if args.use_rotary_position_embeddings: + update_rotary_pos_emb(args.seq_length) + else: + model = GPTModel( + config=config, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + ) + num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + log.info(80 * "-") + log.info(f"Number of parameters in model: {num_params}") + log.info(80 * "-") + see_memory_usage("After Building Model", force=True) + if wandb is not None and getattr(wandb, "run", None) is not None: + assert wandb.run is not None + tbdir = args.tensorboard_dir + # tbdir = args.getattr('tensorboard_dir', None) + if tbdir is not None: + try: + log.info(f"Patching tensorboard from {tbdir}") + wandb.tensorboard.patch(root_logdir=tbdir) + except ValueError as exc: + log.exception(exc) + log.warning("Continuing without patching tensorboard!") + wandb.run.config.update({"num_params": num_params}) + if "args" not in wandb.run.config: + log.info( + f"Updating WandB run.config: [{wandb.run.name}]({wandb.run.get_url()})" + ) + try: + wandb.run.config.update({"args": dict(sorted(vars(args).items()))}) + except Exception: + log.error('Unable to `wandb.run.config.update({"args": vars(args)})`') + # try: + # wandb.run.watch( + # model, + # log='all', + # log_graph=True, + # ) + # except Exception: + # pass + return model + + +def get_batch(data_iterator): + """Generate a batch""" + args = get_args() + tokenizer = get_tokenizer() + assert args is not None and tokenizer is not None + # Items and their type. + keys = ["text"] + datatype = torch.int64 + data = next(data_iterator) if data_iterator is not None else None + + if ( + args.iteration < 10 + and RANK == 0 + and os.environ.get("DUMP_TOKENS", None) + and data is not None + ): + log.info(f"{args.iteration=}: {data['text'][:10]=}") + # # Broadcast data. + # if data_iterator is not None: + # data = next(data_iterator) + # else: + # data = None + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + # Unpack. + tokens_ = data_b["text"].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + # Get the masks and postition ids. 
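+    # Flash-attention kernels apply the causal mask internally, so the
+    # explicit attention mask can be skipped whenever either flash-attn
+    # path is enabled.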
+ skip_mask = args.use_flash_attn or args.use_flash_attn_triton + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + skip_mask, + ) + # For DS's sequence parallel + seq_parallel_world_size = mpu.get_sequence_parallel_world_size() + seq_parallel_world_rank = mpu.get_sequence_parallel_rank() + # For Megatron's sequence parallel + if args.sequence_parallel: + seq_parallel_world_size = mpu.get_tensor_model_parallel_world_size() + seq_parallel_world_rank = mpu.get_tensor_model_parallel_rank() + seq_length = tokens.size(1) + assert seq_length % seq_parallel_world_size == 0 + sub_seq_length = seq_length // seq_parallel_world_size + sub_seq_start = seq_parallel_world_rank * sub_seq_length + sub_seq_end = (seq_parallel_world_rank + 1) * sub_seq_length + tokens = tokens[:, sub_seq_start:sub_seq_end] + position_ids = position_ids[:, sub_seq_start:sub_seq_end] + # For DS's sequence parallel + if mpu.get_sequence_parallel_world_size() > 1: + labels = labels[:, sub_seq_start:sub_seq_end] + return tokens, labels, loss_mask, attention_mask, position_ids + + +def data_post_process(data, data_sampler_state_dict): + args = get_args() + assert args is not None + if args.data_efficiency_curriculum_learning: + if "seqlen_truncate" in data_sampler_state_dict["current_difficulties"]: + args.data_efficiency_curriculum_learning_seqlen_type = "seqlen_truncate" + current_seqlen = data_sampler_state_dict["current_difficulties"][ + "seqlen_truncate" + ] + if current_seqlen < args.seq_length: + data["text"] = data["text"][:, : (current_seqlen + 1)].contiguous() + elif "seqlen_reshape" in data_sampler_state_dict["current_difficulties"]: + args.data_efficiency_curriculum_learning_seqlen_type = "seqlen_reshape" + current_seqlen = data_sampler_state_dict["current_difficulties"][ + "seqlen_reshape" + ] + if current_seqlen < args.seq_length: + orig_num_token = torch.numel(data["text"]) + reshape_len = (data["text"].size()[1] // (current_seqlen + 1)) * ( + current_seqlen + 1 + ) + data["text"] = torch.cat( + ( + data["text"][:, :reshape_len] + .contiguous() + .view(-1, current_seqlen + 1), + data["text"][:, -(current_seqlen + 1) :], + ), + 0, + ).contiguous() + num_row = math.ceil(orig_num_token / (current_seqlen + 1)) + num_row = min(num_row, data["text"].size()[0]) + if num_row > 1 and num_row % 2 != 0: + num_row -= 1 + data["text"] = data["text"][:num_row, :].contiguous() + else: + args.data_efficiency_curriculum_learning_seqlen_type = None + return data + + +def get_batch_pipe(data): + """ + Modification of `get_batch` to work on `next(data_iterator)` + instead of `data_iterator` + """ + args = get_args() + tokenizer = get_tokenizer() + assert args is not None + # Items and their type. + keys = ["text"] + datatype = torch.int64 + # Broadcast data. + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + # Unpack. + tokens_ = data_b["text"].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + # Get the masks and postition ids. 
+ attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + ) + if args.curriculum_learning_legacy and args.curriculum_seqlen < tokens.size()[1]: + # seqlen-based curriculum learning + # tokens, position_ids, labels, loss_mask + # have size [batch size, seqlen] + tokens = tokens[:, : args.curriculum_seqlen].contiguous() + position_ids = position_ids[:, : args.curriculum_seqlen].contiguous() + if labels is not None: + labels = labels[:, : args.curriculum_seqlen].contiguous() + loss_mask = loss_mask[:, : args.curriculum_seqlen].contiguous() + return (tokens, position_ids, attention_mask), (labels, loss_mask) + + +def loss_func(loss_mask, moe_loss, mos_loss, output_tensor): + args = get_args() + assert args is not None + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + if args.mos or args.kd: + # assert max(args.num_experts) >= 1 + loss = loss + moe_loss + mos_loss + if args.mos: + return loss, { + "total loss": loss, + "lm loss": averaged_loss[0], + "moe loss": moe_loss, + "mos loss": mos_loss, + } + elif args.kd: + return loss, { + "total loss": loss, + "lm loss": averaged_loss[0], + "moe loss": moe_loss, + "kd loss": mos_loss, + } + log.info( + f">>> total loss: {loss}, " + f"lm loss {averaged_loss[0]}, " + f"kd loss {mos_loss}" + ) + else: + if max(args.num_experts) <= 1: + return loss, {"lm loss": averaged_loss[0]} + loss = loss + moe_loss + return loss, {"lm loss": averaged_loss[0], "moe loss": moe_loss} + + +def calculate_mos_loss( + args, stu_output, teacher_model, tokens, position_ids, attention_mask +): + mos_loss = 0 + alpha = args.kd_alpha_ce + beta = args.kd_beta_ce + kd_temp = args.kd_temp + if teacher_model: + with torch.no_grad(): + if ( + args.curriculum_learning_legacy + and args.curriculum_seqlen < args.seq_length + ): + assert args.curriculum_seqlen is not None + curriculum_seqlen = args.curriculum_seqlen + tokens = tokens[:, :curriculum_seqlen].contiguous() + position_ids = position_ids[:, :curriculum_seqlen].contiguous() + csl = curriculum_seqlen + attention_mask = attention_mask[:, :, :csl, :csl].contiguous() + # No need to truncate labels + # as we do not need it for the teacher logits + tea_output, tea_other_losses = teacher_model( + tokens, position_ids, attention_mask + ) + assert stu_output.size() == tea_output.size(), ( + "teacher and student output should match in size. " + f"Student: {stu_output.size()}, " + f"Teacher: {tea_output.size()}, " + f"CL seq length {args.curriculum_seqlen}" + ) + student_logits = F.log_softmax(stu_output / kd_temp, dim=2) + # The target logits is expected to be probabilities. + # If we use log_softmax, + # then we need to set target_log to true + # when initializing the KLDivLoss. + tea_logits = F.softmax(tea_output / kd_temp, dim=2) + mos_loss = ( + kd_temp + * kd_temp + * nn.KLDivLoss(reduction="batchmean")(student_logits, tea_logits) + ) + mos_loss = mos_loss.div(args.seq_length) * beta + return mos_loss + + +def forward_step(data_iterator, model) -> tuple[torch.Tensor | None, Callable]: + """Forward step.""" + args = get_args() + timers = get_timers() + assert args is not None + assert timers is not None + # Get the batch. 
+ timers("batch-generator", log_level=2).start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data_iterator) + timers("batch-generator").stop() + if args.data_efficiency_curriculum_learning: + args.curriculum_seqlen = tokens.size()[1] + if hasattr(args, "data_efficiency_curriculum_learning_seqlen_type") and ( + args.data_efficiency_curriculum_learning_seqlen_type == "seqlen_reshape" + ): + args.data_efficiency_curriculum_learning_numel = torch.numel(tokens) + stu_output = None + if args.mos or args.kd: + # The forward func can return either the loss or the logits, + # depending on whether passing in the labels or not. + stu_output, other_losses = model(tokens, position_ids, attention_mask) + if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: + assert args.curriculum_seqlen is not None + labels = labels[:, : args.curriculum_seqlen].contiguous() + output_tensor = tensor_parallel.vocab_parallel_cross_entropy( + stu_output.contiguous().float(), labels + ) + else: + output_tensor, other_losses = model( + tokens, position_ids, attention_mask, labels=labels + ) + if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: + loss_mask = loss_mask[:, : args.curriculum_seqlen].contiguous() + + moe_losses = [] + for moe_loss in other_losses: + if moe_loss is not None: + moe_losses.append(moe_loss) + moe_loss = sum(moe_losses) * args.moe_loss_coeff + + mos_loss = 0 + if args.mos or args.kd: + assert model.training + if args.teacher_forward and args.teacher_model is not None: + mos_loss = calculate_mos_loss( + args, + stu_output, + args.teacher_model[0], + tokens, + position_ids, + attention_mask, + ) + + # Output_tensor stores the standard loss, + # loss_func calculates the total loss. + return output_tensor, partial(loss_func, loss_mask, moe_loss, mos_loss) + + +@ez.dist.timeitlogit(rank=RANK) +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + t0 = time.perf_counter() + args = get_args() + assert args is not None + # from ezpz.profile import get_context_manager + # cm = get_context_manager(rank=RANK, outdir=args.save) + # with cm: + log.info("> building train, validation, and test datasets for GPT ...") + files = [] + if args.data_file_list is not None: + log.info(f"Reading datasets from {args.data_file_list}") + # [!NOTE]: + # - We expect each line of args.data_file_list to be of the form: + # `weight /path/tp/data_text_document corpus` + # where: + # - `weight` is the relative weight of that document + # across all documents (i.e. lines in `args.data_file_list`) + # - `/path/to/data_text_document` is the path to the text document + # - `corpus` is the corpus (~ source, can be made up) where that + # document came from (i.e. `books`, `arxiv`, etc.) 
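+    # For example (hypothetical weights and paths), one entry per line:
+    #   0.3 /path/to/books/books-0000_text_document books
+    #   0.7 /path/to/arxiv/arxiv-0000_text_document arxiv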
+ with open(args.data_file_list, "r") as flist: + for f in flist.readlines(): + if len(f.strip()) != 0: + try: + w, fname, c = f.split() + except Exception as exc: + log.exception(exc) + raise Exception( + "Please provide the file list as " + "'weight, filename, corpus'" + ) + if fname.find(".bin") != -1: + fname = fname.split(".bin")[0] + files.extend([float(w), fname, c]) # weight # filename # corpus + elif len(args.data_path) == 1 and os.path.isdir(args.data_path[0]): + path = args.data_path[0] + "/" + for f in os.listdir(path): + if os.path.isfile(path + f) and f.find(".bin") != -1: + files.append(1) + files.append(path + f.split(".bin")[0]) + else: + files = args.data_path + + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=files, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + train_data_prefix=args.train_data_path, + valid_data_prefix=args.valid_data_path, + test_data_prefix=args.test_data_path, + data_cache_path=args.data_cache_path, + ) + dt = time.perf_counter_ns() - t0 + log.info(f"> finished creating GPT datasets. Took: {dt:.5f}s") + return train_ds, valid_ds, test_ds + + +def command_exists(cmd): + result = subprocess.Popen(f"type {cmd}", stdout=subprocess.PIPE, shell=True) + return result.wait() == 0 + + +def git_ds_info(): + if RANK != 0: + return + from deepspeed.env_report import main as ds_report + + ds_report() + + # Write out version/git info + git_hash_cmd = "git rev-parse --short HEAD" + git_branch_cmd = "git rev-parse --abbrev-ref HEAD" + if command_exists("git"): + try: + result = subprocess.check_output(git_hash_cmd, shell=True) + git_hash = result.decode("utf-8").strip() + result = subprocess.check_output(git_branch_cmd, shell=True) + git_branch = result.decode("utf-8").strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" + else: + git_hash = "unknown" + git_branch = "unknown" + print( + f"**** Git info for Megatron: " + f"git_hash={git_hash} git_branch={git_branch} ****" + ) + + +def main(): + if os.getenv("TORCH_PROFILER_ENABLE") == "1": + # record_function + from torch.profiler import profile, ProfilerActivity + + try: + activities = [ + ProfilerActivity.CPU, + ProfilerActivity.CUDA, + ProfilerActivity.XPU, + ] + except Exception as exc: + log.exception(exc) + log.warning("TORCH PROFILER WARNING: XPU is not supported") + activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA] + with profile(activities=activities) as prof: + model = pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + data_post_process=data_post_process, + ) + args = get_args() + assert args is not None + prof.export_chrome_trace( + f"{args.trace_dir}/torch-trace-{RANK}-of-{WORLD_SIZE}.json" + ) + else: + model = pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + data_post_process=data_post_process, + ) + # try: + # from megatron.text_generation import generate_and_post_process + # with torch.autocast(device_type=DEVICE, dtype=args.dtype): + # response, _, _, _ = generate_and_post_process( + # model, + # prompts=[ + # "Hello world", + # "Nature is", + # "Turing test comprises", + # "Explain solar eclipse" + # ], + # 
tokens_to_generate=32 + # ) + # if RANK == 0: + # log.info(f'generation completed..\n response:{response}') + # except ValueError as ve: + # log.critical(f'ValueError: {ve}') + # pass + # dist.barrier() + # model.train() + return model + + +if __name__ == "__main__": + # git_ds_info() + # pretrain(train_valid_test_datasets_provider, + # model_provider, + # ModelType.encoder_or_decoder, + # forward_step, + # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + # data_post_process=data_post_process) + import sys + import deepspeed.comm as dist + + model = main() + dist.log_summary() + if wandb.run is not None: + print(f"wandb.run.name: {wandb.run.name}") + print(f"wandb.run.url: {wandb.run.url}") + wandb.finish() + sys.exit(0) diff --git a/pretrain_llama.py b/pretrain_llama.py new file mode 100644 index 0000000000..ab7ffc785c --- /dev/null +++ b/pretrain_llama.py @@ -0,0 +1,589 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Pretrain GPT""" + +import os +import torch +import math + +# import logging + +from functools import partial +from megatron import get_args +from megatron import print_rank_0 +from rich import print +from megatron import get_timers +from megatron import get_tokenizer +from megatron.core import mpu, tensor_parallel +from megatron.core.enums import ModelType +from megatron.data.gpt_dataset import build_train_valid_test_datasets +from megatron.model import GPTModel, GPTModelPipe +from megatron.training import pretrain +from megatron.utils import get_ltor_masks_and_position_ids +from megatron.utils import ( + average_losses_across_data_parallel_group, + update_rotary_pos_emb, +) +from megatron.arguments import core_transformer_config_from_args + +import deepspeed +from deepspeed.runtime.utils import see_memory_usage +from deepspeed.accelerator.real_accelerator import get_accelerator +import subprocess +import wandb + +from torch import nn +import torch.nn.functional as F + +# from ezpz import get_logger +from ezpz.dist import setup_torch, get_world_size, setup_wandb + +RANK = setup_torch( + backend="deepspeed", + port="5432", +) +WORLD_SIZE = get_world_size() +LEVEL = "DEBUG" if RANK == 0 else "CRITICAL" + +WANDB_MODE = os.environ.get("WANDB_MODE", None) +DISABLE_WANDB = ( + WANDB_MODE is not None and str(WANDB_MODE).lower() == "disabled" +) + +if RANK == 0 and not DISABLE_WANDB: + # args = get_args() + # assert args is not None + # tensorboard_dir = args.tensorboard_dir + # if args.tensorboard_dir is not None: + # print(f'Setting (in env): {TENSORBOARD_DIR=}') + # os.environ['TENSORBOARD_DIR'] = args.tensorboard_dir + project_name = os.environ.get( + "WB_PROJECT", + os.environ.get("WANDB_PROJECT", "GenSLM-Megatron-DS"), + ) + print("--------------------------------------------------") + print(f"Setting up W&B from: {RANK} with {project_name}") + print("--------------------------------------------------") + setup_wandb(project_name=project_name) + + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + print_rank_0("building GPT model ...") + see_memory_usage("Before Building Model", force=True) + args = get_args() + assert args is not None + config = core_transformer_config_from_args(args) + # args = get_args() + # timers = get_timers() + if wandb.run is not None: + print(f"Updating WandB run: [{wandb.run.name}]({wandb.run.url})") + wandb.run.config.update({"args": vars(args)}) + if RANK == 0: + git_ds_info() + + with deepspeed.zero.Init( + sequence_data_parallel_group=mpu.get_sequence_data_parallel_group(), + 
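+        # NOTE: parameters are partitioned across workers at construction time
+        # only when ZeRO stage 3 is active (`enabled=args.zero_stage == 3` below);
+        # otherwise this context manager is a no-op.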
remote_device=( + None if args.remote_device == "none" + else args.remote_device, + ), + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu, + ): + if args.deepspeed and not args.no_pipeline_parallel: + model = GPTModelPipe( + config=config, + num_tokentypes=0, + parallel_output=True + ) + # This is a hack to give us a reference to get_batch_pipe from + # within training.py We need to call model.set_batch_fn after + # deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + + # Predompute the attention mask and store it in args. This avoids + # having to pipeline it as an activation during training. The mask + # is constant, and thus we can reuse it. + attention_mask = torch.tril( + torch.ones( + (1, args.seq_length, args.seq_length), + device=get_accelerator().current_device_name(), + ) + ).view(1, 1, args.seq_length, args.seq_length) + + # Convert attention mask to binary: + attention_mask = attention_mask < 0.5 + if args.fp16: + attention_mask = attention_mask.half() + elif args.bf16: + attention_mask = attention_mask.bfloat16() + + # Attention mask must be bool. + args.attn_mask = attention_mask.to(torch.bool) + + # For prertaining, since sequence length is fixed, cache rotary + # embedding in args, to avoid communicating around + if args.use_rotary_position_embeddings: + update_rotary_pos_emb(args.seq_length) + + else: + model = GPTModel( + config=config, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + ) + num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + # print_rank_0('\n ------------------------ ') + # print_rank_0(f'num of parameters {num_params}') + # print_rank_0('------------------------\n ') + print_rank_0(80 * "-") + print_rank_0(f"Number of parameters in model: {num_params}") + print_rank_0(80 * "-") + see_memory_usage("After Building Model", force=True) + if wandb.run is not None: + wandb.run.watch( + model, + log="all", + log_graph=True, + ) + wandb.run.config.update({"num_params": num_params}) + return model + + +def get_batch(data_iterator): + """Generate a batch""" + args = get_args() + tokenizer = get_tokenizer() + assert args is not None + assert tokenizer is not None + # Items and their type. + keys = ["text"] + datatype = torch.int64 + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + # Unpack. + tokens_ = data_b["text"].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + # Get the masks and postition ids. 
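+    # `skip_mask` is meant to skip building the full attention-mask tensor
+    # (not needed when flash-attention applies causal masking internally);
+    # note it is set here from attribute *presence* rather than the flags' values.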
+ skip_mask = ( + hasattr(args, "use_flash_attn") + or hasattr(args, "flash_attn_triton") + ) + # skip_mask = args.use_flash_attn or args.use_flash_attn_triton + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + skip_mask, + ) + # For DS's sequence parallel + seq_parallel_world_size = mpu.get_sequence_parallel_world_size() + seq_parallel_world_rank = mpu.get_sequence_parallel_rank() + # For Megatron's sequence parallel + if args.sequence_parallel: + seq_parallel_world_size = mpu.get_tensor_model_parallel_world_size() + seq_parallel_world_rank = mpu.get_tensor_model_parallel_rank() + seq_length = tokens.size(1) + assert seq_length % seq_parallel_world_size == 0 + sub_seq_length = seq_length // seq_parallel_world_size + sub_seq_start = seq_parallel_world_rank * sub_seq_length + sub_seq_end = (seq_parallel_world_rank + 1) * sub_seq_length + tokens = tokens[:, sub_seq_start:sub_seq_end] + position_ids = position_ids[:, sub_seq_start:sub_seq_end] + # For DS's sequence parallel + if mpu.get_sequence_parallel_world_size() > 1: + labels = labels[:, sub_seq_start:sub_seq_end] + return tokens, labels, loss_mask, attention_mask, position_ids + + +def data_post_process(data, data_sampler_state_dict): + args = get_args() + assert args is not None + if args.data_efficiency_curriculum_learning: + if ( + "seqlen_truncate" in data_sampler_state_dict[ + "current_difficulties" + ] + ): + args.data_efficiency_curriculum_learning_seqlen_type = ( + "seqlen_truncate" + ) + current_seqlen = ( + data_sampler_state_dict["current_difficulties"][ + "seqlen_truncate" + ] + ) + if current_seqlen < args.seq_length: + data["text"] = ( + data["text"][:, : (current_seqlen + 1)].contiguous() + ) + elif ( + "seqlen_reshape" in data_sampler_state_dict["current_difficulties"] + ): + args.data_efficiency_curriculum_learning_seqlen_type = ( + "seqlen_reshape" + ) + current_seqlen = ( + data_sampler_state_dict["current_difficulties"][ + "seqlen_reshape" + ] + ) + if current_seqlen < args.seq_length: + orig_num_token = torch.numel(data["text"]) + reshape_len = ( + (data["text"].size()[1] // (current_seqlen + 1)) + * (current_seqlen + 1) + ) + data["text"] = torch.cat( + ( + data["text"][:, :reshape_len] + .contiguous() + .view(-1, current_seqlen + 1), + data["text"][:, -(current_seqlen + 1):], + ), + 0, + ).contiguous() + num_row = math.ceil(orig_num_token / (current_seqlen + 1)) + num_row = min(num_row, data["text"].size()[0]) + if num_row > 1 and num_row % 2 != 0: + num_row -= 1 + data["text"] = data["text"][:num_row, :].contiguous() + else: + args.data_efficiency_curriculum_learning_seqlen_type = None + return data + + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of + `data_iterator`""" + args = get_args() + tokenizer = get_tokenizer() + assert tokenizer is not None and args is not None + + # Items and their type. + keys = ["text"] + datatype = torch.int64 + + # Broadcast data. + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b["text"].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. 
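+    # For the pipeline-parallel path, a causal attention mask has already been
+    # precomputed and cached as `args.attn_mask` in `model_provider` above.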
+ attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + ) + if ( + args.curriculum_learning_legacy + and args.curriculum_seqlen < tokens.size()[1] + ): + # seqlen-based curriculum learning + # tokens, position_ids, labels, loss_mask have size: + # [batch size, seqlen] + tokens = tokens[:, : args.curriculum_seqlen].contiguous() + position_ids = position_ids[:, : args.curriculum_seqlen].contiguous() + if labels is not None: + labels = labels[:, : args.curriculum_seqlen].contiguous() + loss_mask = loss_mask[:, : args.curriculum_seqlen].contiguous() + + return (tokens, position_ids, attention_mask), (labels, loss_mask) + + +def loss_func(loss_mask, moe_loss, mos_loss, output_tensor): + args = get_args() + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + if args.mos or args.kd: # type:ignore + # assert max(args.num_experts) >= 1 + loss = loss + moe_loss + mos_loss + if args.mos: # type:ignore + # return loss, { + # "total loss": loss, + # "lm loss": averaged_loss[0], + # "moe loss": moe_loss, + # "mos loss": mos_loss, + # } + losses = { + "total loss": loss, + "lm loss": averaged_loss[0], + "moe loss": moe_loss, + "mos loss": mos_loss, + } + elif args.kd: # type:ignore + # return loss, { + # "total loss": loss, + # "lm loss": averaged_loss[0], + # "moe loss": moe_loss, + # "kd loss": mos_loss, + # } + losses = { + "total-loss": loss, + "lm-loss": averaged_loss[0], + "moe-loss": moe_loss, + "kd-loss": mos_loss, + } + print_rank_0( + ">>> total loss: {}, lm loss {}, kd loss {}".format( + loss, averaged_loss[0], mos_loss + ) + ) + else: + if max(args.num_experts) <= 1: # type:ignore + losses = {"lm-loss": averaged_loss[0]} + # return loss, {"lm loss": averaged_loss[0]} + else: + loss = loss + moe_loss + losses = {"lm-loss": averaged_loss[0], "moe loss": moe_loss} + # return loss, {"lm loss": averaged_loss[0], "moe loss": moe_loss} + if wandb is not None and wandb.run is not None: + # wandb.run.log({}) + losses |= {'loss': loss} + wandb.run.log({f"Loss/{k}": v for k, v in losses.items()}) + return loss, losses + + +def calculate_mos_loss( + args, + stu_output, + teacher_model, + tokens, + position_ids, + attention_mask, +): + mos_loss = 0 + alpha = args.kd_alpha_ce + beta = args.kd_beta_ce + kd_temp = args.kd_temp + + if teacher_model: + with torch.no_grad(): + if ( + args.curriculum_learning_legacy + and args.curriculum_seqlen < args.seq_length + ): + assert args.curriculum_seqlen is not None + curriculum_seqlen = args.curriculum_seqlen + tokens = tokens[:, :curriculum_seqlen].contiguous() + position_ids = position_ids[:, :curriculum_seqlen].contiguous() + attention_mask = attention_mask[ + :, :, :curriculum_seqlen, :curriculum_seqlen + ].contiguous() + # No need to truncate labels as we do not need it for the + # teacher logits + tea_output, tea_other_losses = teacher_model( + tokens, position_ids, attention_mask + ) + assert ( + stu_output.size() == tea_output.size() + ), ( + "teacher and student output should match in size. 
" + f"Student: {stu_output.size()}, " + f"Teacher: {tea_output.size()}, " + f"CL seq length {args.curriculum_seqlen}" + ) + + student_logits = F.log_softmax(stu_output / kd_temp, dim=2) + tea_logits = F.softmax( + tea_output / kd_temp, dim=2 + ) + # The target logits is expected to be probabilities. If we use + # log_softmax, then we need to set target_log to true when initializing + # the KLDivLoss. + mos_loss = ( + kd_temp + * kd_temp + * nn.KLDivLoss(reduction="batchmean")(student_logits, tea_logits) + ) + + mos_loss = mos_loss.div(args.seq_length) * beta + return mos_loss + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + assert timers is not None and args is not None + + # Get the batch. + timers("batch-generator", log_level=2).start() + tokens, labels, loss_mask, attention_mask, position_ids = ( + get_batch(data_iterator) + ) + timers("batch-generator").stop() + + if args.data_efficiency_curriculum_learning: # type: ignore + args.curriculum_seqlen = tokens.size()[1] # type: ignore + if ( + hasattr( + args, + "data_efficiency_curriculum_learning_seqlen_type" + ) + and ( + args.data_efficiency_curriculum_learning_seqlen_type + == "seqlen_reshape" + ) + ): + args.data_efficiency_curriculum_learning_numel = ( + torch.numel(tokens) + ) + + assert args is not None + if args.mos or args.kd: # type:ignore + # The forward func can return either the loss or the logits, depending + # on whether passing in the labels or not. + stu_output, other_losses = model(tokens, position_ids, attention_mask) + if ( + args.curriculum_learning_legacy # type:ignore + and args.curriculum_seqlen < args.seq_length + ): + assert args.curriculum_seqlen is not None + labels = labels[:, : args.curriculum_seqlen].contiguous() + output_tensor = tensor_parallel.vocab_parallel_cross_entropy( + stu_output.contiguous().float(), labels + ) + else: + output_tensor, other_losses = model( + tokens, position_ids, attention_mask, labels=labels + ) + if ( + args.curriculum_learning_legacy + and args.curriculum_seqlen < args.seq_length + ): + loss_mask = loss_mask[:, : args.curriculum_seqlen].contiguous() + + moe_losses = [] + for moe_loss in other_losses: + if moe_loss is not None: + moe_losses.append(moe_loss) + moe_loss = sum(moe_losses) * args.moe_loss_coeff + + mos_loss = 0 + if args.mos or args.kd: + assert model.training + if args.teacher_forward and args.teacher_model is not None: + mos_loss = calculate_mos_loss( + args, + stu_output, + args.teacher_model[0], + tokens, + position_ids, + attention_mask, + ) + + # Output_tensor stores the standard loss, loos_func calculates the total + # loss. + return output_tensor, partial(loss_func, loss_mask, moe_loss, mos_loss) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + assert args is not None + print_rank_0( + "> building train, validation, and test datasets " "for GPT ..." 
+ ) + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + train_data_prefix=args.train_data_path, + valid_data_prefix=args.valid_data_path, + test_data_prefix=args.test_data_path, + data_cache_path=args.data_cache_path, + ) + print_rank_0("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds + + +def command_exists(cmd): + result = subprocess.Popen( + f"type {cmd}", + stdout=subprocess.PIPE, + shell=True + ) + return result.wait() == 0 + + +def git_ds_info(): + from deepspeed.env_report import main as ds_report + + ds_report() + + # Write out version/git info + git_hash_cmd = "git rev-parse --short HEAD" + git_branch_cmd = "git rev-parse --abbrev-ref HEAD" + if command_exists("git"): + try: + result = subprocess.check_output(git_hash_cmd, shell=True) + git_hash = result.decode("utf-8").strip() + result = subprocess.check_output(git_branch_cmd, shell=True) + git_branch = result.decode("utf-8").strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" + else: + git_hash = "unknown" + git_branch = "unknown" + print( + f"**** Git info for Megatron: " + f"git_hash={git_hash} git_branch={git_branch} ****" + ) + + +def main(): + model = pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={"tokenizer_type": "GPT2BPETokenizer"}, + data_post_process=data_post_process, + ) + return model + + +if __name__ == "__main__": + # git_ds_info() + # pretrain(train_valid_test_datasets_provider, + # model_provider, + # ModelType.encoder_or_decoder, + # forward_step, + # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + # data_post_process=data_post_process) + import sys + import deepspeed.comm as dist + + model = main() + dist.log_summary() + if wandb.run is not None: + print(f"wandb.run.name: {wandb.run.name}") + print(f"wandb.run.url: {wandb.run.url}") + wandb.finish() + sys.exit() diff --git a/test_agptllama.py b/test_agptllama.py new file mode 100644 index 0000000000..e1d207fa27 --- /dev/null +++ b/test_agptllama.py @@ -0,0 +1,34 @@ +import torch +import intel_extension_for_pytorch as ipex +from transformers import GPT2Tokenizer, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, LlamaTokenizer, LlamaForCausalLM +def batch_encode(prompts, tokenizer, prompt_len=512): + input_tokens = tokenizer.batch_encode_plus(prompts, return_tensors="pt", padding="max_length", max_length=len(prompts)) + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to("xpu") + #input_tokens[t] = input_tokens[t].to(torch.cuda.current_device()) + return input_tokens + + +def generate_prompt(model, tokenizer, prompts): + + input_tokens = batch_encode(prompts, tokenizer) + print(input_tokens) + generate_kwargs = dict(max_new_tokens=30, do_sample=False) + output_ids = model.generate(**input_tokens, **generate_kwargs) + print(output_ids) + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + return outputs + +if __name__ == '__main__': + + model = LlamaForCausalLM.from_pretrained("/flare/Aurora_deployment/vsastry/hf_new_cp/") + model.to("xpu") # model.cuda() + model.seqlen = 4096 + + # get llama tokenizer + tokenizer = 
LlamaTokenizer.from_pretrained("/flare/Aurora_deployment/AuroraGPT/datasets/dolma/utils/tokenizer.model") + tokenizer.pad_token = tokenizer.eos_token + output = generate_prompt(model, tokenizer, prompts=["What is the language spoken in Mexico ?"]) + print(output) diff --git a/train_aGPT_7B.sh b/train_aGPT_7B.sh new file mode 100644 index 0000000000..1350ea0f2a --- /dev/null +++ b/train_aGPT_7B.sh @@ -0,0 +1,40 @@ +#!/bin/bash --login +#PBS -q lustre_scaling +#PBS -A Aurora_Deployment +#PBS -j oe + +##################################### +# AuroraGPT-7B +# +# Main production script for training +# AuroraGPT-7B @ ALCF +##################################### + +# 1. Navigate into `$PBS_O_WORKDIR` +cd "${PBS_O_WORKDIR}" || exit +HERE=$(python3 -c 'import os; print(os.getcwd())') && export HERE +GIT_BRANCH=$(git branch --show-current) && export GIT_BRANCH + + +# 2. source `ALCF/helpers.sh` +source "${HERE}/ALCF/helpers.sh" || exit + +# 3. call `setup` from `./ALCF/helpers.sh` +setup "$@" || exit +# export run_cmd="${run_cmd}" +echo "${run_cmd[@]}" | tee -a "${OUTPUT_LOG}" + +# 4. Tell user where to find output +printf "[!! %s] View output at:\n %s\n" "$(printBlue "NOTE")" "$(printYellow "${OUTPUT_LOG}")" | tee -a "${OUTPUT_LOG}" + +# # 5. Ignore the following strings on Intel XPU devices +# # (otherwise they'll clutter up logs) +# XPU_IGNORE_STRING="CCL_WARN|\ -\ INFO\ \-\ |real_accelerator\.py|numexpr\.utils|async_io|libaio" + +# if [[ $(ezpz_get_machine_name) == "aurora" ]]; then +# module unload mpich && module load mpich +# fi +# +# 6. Evaluate ${run_cmd} and append outputs to ${OUTPUT_LOG} +# eval "${run_cmd[@]}" |& tee -a "${OUTPUT_LOG}" +eval "${run_cmd[*]}" |& tee -a "${OUTPUT_LOG}" diff --git a/train_agpt_polaris_7B_production.sh b/train_agpt_polaris_7B_production.sh new file mode 100644 index 0000000000..f83b6ebc29 --- /dev/null +++ b/train_agpt_polaris_7B_production.sh @@ -0,0 +1,29 @@ +#!/bin/bash --login +# +# This script can be submitted with `qsub` via: +# +# ```bash +# $ git clone https://github.com/argonee-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed +# $ qsub train_agpt_polaris_7B_production.sh +# ``` + +cd "${PBS_O_WORKDIR}" || exit + +TODAY="$(date "+%Y-%m-%d")" +NOW="$(date "+%Y-%m-%d-%H%M%S")" +OUTDIR="${PBS_O_WORKDIR}/pbslogs/${TODAY}" +OUTFILE="${OUTDIR}/${PBS_JOBID}-${NOW}.log" +mkdir -p $(dirname "${OUTFILE}") + +echo "${OUTFILE}" >> "$(dirname ${OUTDIR})/latest" +echo "Logging job output to: ${OUTFILE}" + +# export DEBUG=1 +# export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=6000 + +# Path to the data file list: +DFL="${PBS_O_WORKDIR}/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt" + +# Launch: +MICRO_BATCH=2 DATA_FILE_LIST="${DFL}" bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh new file mode 100644 index 0000000000..259552ffbc --- /dev/null +++ b/train_llama_alcf.sh @@ -0,0 +1,49 @@ +#!/bin/bash --login + +############################################################################### +# Check if running in DEBUG=1 mode. +# - If so, this will print each command before it is ran and exit if any of +# them return a nonzero exit status. +############################################################################### +if [[ -n "${DEBUG-}" ]]; then # to use: `DEBUG=1 bash train_llama_alcf.sh` + printf "\e[1;31m%s\e[0m\n" "!! RUNNING IN DEBUG MODE !!" + set -euxo pipefail +fi + +############################################################################### +# Print (but DO NOT EXECUTE !!) 
each command that would be ran. +# +# Enable with: NOOP=1 PBS_O_WORKDIR=$(pwd) bash train_llama_alcf.sh +############################################################################### +if [[ -v NOOP ]]; then # to use: `NOOP=1 bash train_llama_alcf.sh` + echo "Run NOOP mode" + set -o noexec # same as set -n +fi + +XPU_IGNORE_STRING="CCL_WARN|\ -\ INFO\ \-\ |real_accelerator\.py|numexpr\.utils|async_io|libaio" + +##################### +# MAIN PROGRAM LOGIC +##################### +main() { + # 1. Navigate into `$PBS_O_WORKDIR` + cd "${PBS_O_WORKDIR}" || exit + HERE=$(python3 -c 'import os; print(os.getcwd())') && export HERE + # 2. source `ALCF/helpers.sh` + source "${HERE}/ALCF/helpers.sh" || exit + # 3. call `setup` from `./ALCF/helpers.sh` + setup "$@" || exit + # 4. Take custom args + export custom_args=" $@" + # 5. Update ${run_cmd} (from setup ALCF/helpers.sh) with ${custom_args} + export run_cmd="${run_cmd} ${custom_args}" + # 6. Add "${run_cmd}" to output log + echo "${run_cmd}" | tee -a "${OUTPUT_LOG}" + # 7. Tell user where to find output + printf "[!! %s] View output at:\n %s\n" "$(printBlue "NOTE")" "$(printYellow "${OUTPUT_LOG}")" | tee -a "${OUTPUT_LOG}" + # 8. Evaluate ${run_cmd} and append outputs to ${OUTPUT_LOG} + eval "${run_cmd}" |& grep -E -v "${XPU_IGNORE_STRING}" |& tee -a "${OUTPUT_LOG}" + set +x +} + +main diff --git a/train_llama_alcf_aurora_qsub.sh b/train_llama_alcf_aurora_qsub.sh new file mode 100755 index 0000000000..6f247da9c8 --- /dev/null +++ b/train_llama_alcf_aurora_qsub.sh @@ -0,0 +1,7 @@ +#!/bin/bash --login + + +cd "${PBS_O_WORKDIR}" || exit +eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate anl_release_q4v2 +source /home/foremans/anl_24_release_q4/llm.devkit/setenv.sh +bash ./train_llama_alcf_aurora.sh diff --git a/train_llama_alcf_polaris_hzheng.sh b/train_llama_alcf_polaris_hzheng.sh new file mode 100755 index 0000000000..83d8a2c5a7 --- /dev/null +++ b/train_llama_alcf_polaris_hzheng.sh @@ -0,0 +1,76 @@ +#!/bin/bash +#PBS -l walltime=0:30:00 +#PBS -A datascience +#PBS -q debug-scaling +#PBS -l select=2 +#PBS -l filesystems=eagle:grand:home +cd ${PBS_O_WORKDIR} +export PPN=4 +export MD=/eagle/argonne_tpc/soft/Megatron-DeepSpeed +source /eagle/argonne_tpc/soft/conda.sh + +export PBS_JOBSIZE=$(cat $PBS_NODEFILE | uniq | wc -l) +export TP=1 +export PP=1 +export MBS=1 +export BS=$((MBS*PBS_JOBSIZE*PPN/PP/TP)) +export SP=$((PBS_JOBSIZE*PPN/PP/TP)) +export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") +export DATA_FILE_LIST="/eagle/datasets//dolma/data_file_list_reweighted.txt" +echo "BS: $BS - PP:$PP - TP: $TP, PBS_JOBSIZE: $PBS_JOBSIZE" +# First time running, it will compile the fused kernels, which will take about 10 mins +# >>> done with compiling and loading fused kernels. 
Compilation time: 545.468 seconds + +HIDDEN_SIZE=4096 +NUM_LAYERS=32 +SEQ_LENGTH=2048 +EMBEDDINGS=2048 +TRAIN_ITERS=10 +ZERO_STAGE=2 +MODEL=LLAMA_7B +#LAUNCHER="//eagle/argonne_tpc/soft/Megatron-DeepSpeed/..//conda/2024-03-11/lib/python3.10/site-packages/deepspeed/launcher/launcher_helper.py --launcher mpich " +OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} +APRUN_PMI=pmix aprun -n $((PBS_JOBSIZE*PPN)) -N $PPN --cc depth -d 16 /eagle/argonne_tpc/soft/Megatron-DeepSpeed/local_rank.sh python3 $LAUNCHER ./pretrain_gpt_alcf.py \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ + --ffn-hidden-size 5504 \ + --num-attention-heads 32 \ + --micro-batch-size ${MBS} \ + --global-batch-size ${BS} \ + --seq-length ${SEQ_LENGTH} \ + --max-position-embeddings ${EMBEDDINGS} \ + --train-iters 10 \ + --save ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --load ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --tokenizer-type Llama2Tokenizer \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 3e-4 \ + --lr-decay-style cosine \ + --min-lr 3e-5 \ + --weight-decay 0.1 \ + --clip-grad 1 \ + --lr-warmup-iters 2 \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --log-interval 1 \ + --cpu-optimizer \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 --fp16 \ + --no-query-key-layer-scaling \ + --attention-dropout 0 \ + --hidden-dropout 0 \ + --use-rotary-position-embeddings \ + --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ + --untie-embeddings-and-output-weights \ + --swiglu --normalization layernorm --disable-bias-linear --num-key-value-heads 4 \ + --tensorboard-dir ${MD}/outputs/${OUTPUT_PREFIX}/tensorboard --log-timers-to-tensorboard --tensorboard-log-interval 1 \ + --data-file-list ${DATA_FILE_LIST} \ + --data-path ${DATA_PATH} \ + --vocab-file ${MD}/dataset/gpt2-vocab.json --merge-file ${MD}/dataset/gpt2-merges.txt \ + --zero-stage=${ZERO_STAGE} --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed \ + --data-cache-path ./data_cache_path/ diff --git a/train_llama_nersc_perlmutter.sh b/train_llama_nersc_perlmutter.sh new file mode 100644 index 0000000000..8131579809 --- /dev/null +++ b/train_llama_nersc_perlmutter.sh @@ -0,0 +1,141 @@ +#!/bin/bash --login +#SBATCH -A m4388_g +#SBATCH -C 'gpu&hbm80g' +#SBATCH -q regular +#SBATCH -t 00:30:00 +#SBATCH --nodes 128 +#SBATCH --gpus 512 +# + +function sourceFile() { + fp="$1" + echo "source-ing ${fp}" + if [[ -f "${fp}" ]]; then + # shellcheck source="${fp}" + source "${fp}" + else + echo "ERROR: UNABLE TO SOURCE ${fp}" + fi +} + +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# ---- 0. Navigate into `$PBS_O_WORKDIR` ------------------------------------- +# cd "${PBS_O_WORKDIR}" || exit +cd "${SLURM_SUBMIT_DIR}" || exit +HERE=$(python3 -c 'import os; print(os.getcwd())') +export HERE +# dflfb="${HERE}/genslm-subsample.txt" +# ---- 1. Assert `./pretrain_gpt_alcf.py` exists: ----------------------------- +export EXEC="${HERE}/pretrain_gpt_alcf.py" +[ -f "${EXEC}" ] || exit +# ---- 2. `source ./ALCF/helpers_alcf.sh`: ------------------------------------ +sourceFile "${HERE}/ALCF/helpers.sh" || exit +# ---- 3. Call fns from `./ALCF/helpers_alcf.sh` ------------------------------ +setEnv || exit # 1. load `conda` environment +saveDSenv || exit # 2. save env vars to `.deepspeed_env` +ezpz || exit # 3. determine WORLD_SIZE, etc. 
from `PBS_*` vars +makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` +setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` +buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ +setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} +setArgs || exit # 8. specify additional `deepspeed` arguments +setData "${DATA_FILE_LIST:-${dflfb}}"|| exit # 9. specify `DATA_FILE_LIST` for dolma dataset +setDSlauncher "${HERE}" || exit # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` +printJobInfo || exit # 11. print job info +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +# Take custom args +custom_args=" $@" + +# Assert `./hostfile_deepspeed` exists +export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit +TBDIR="${CKPT_DIR}/tensorboard" +mkdir -p "${TBDIR}" + +# source "${HERE}/venvs/polaris/2024-03-14/bin/activate" || exit +# echo "Using $(which python3)" +# --launcher_args='--pmi=pmix' + # deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ + # ${launch_cmd} \ + # --optimizer adam \ + # --use-flash-attn-v2 \ + # deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ +# source ezpz/src/ezpz/bin/getjobenv || exit +# if [[ -z "${DIST_LAUNCH}" ]]; then +# setupSrun || exit +# echo "Using SRUN_EXEC: ${SRUN_EXEC}" +# else +# SRUN_EXEC="${DIST_LAUNCH}" +# fi +# echo "Using SRUN_EXEC: ${SRUN_EXEC}" +# +export NHOSTS="${SLURM_NNODES:-1}" +export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" +export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" +export SRUN_EXEC="srun --gpus ${NGPUS} --gpus-per-node ${NGPU_PER_HOST} -N ${NHOSTS} -n ${NGPUS} -l -u --verbose" + + # srun --gpus ${NGPUS} \ + # --gpus-per-node ${NGPU_PER_HOST} \ + # -N ${NHOSTS} \ + # -n ${NGPUS} \ + # -l -u --verbose python3 ${EXEC} \ +run_cmd=" + ${SRUN_EXEC} python3 ${EXEC} \ + --$DTYPE \ + --optimizer ${OPT} \ + --num-workers 0 \ + --split 100,0,0 \ + --log-interval 1 \ + --no-bias-gelu-fusion \ + --lr-decay-style cosine \ + --no-bias-dropout-fusion \ + --no-masked-softmax-fusion \ + --tokenizer-type Llama2Tokenizer \ + --no-gradient-accumulation-fusion \ + --accumulate-allreduce-grads-in-fp32 \ + --use-checkpoint-opt_param-scheduler \ + --tensorboard-dir ${TBDIR} \ + --log-timers-to-tensorboard \ + --log-optimizer-states-to-tensorboard \ + --lr ${LR} \ + --save ${CKPT_DIR} \ + --load ${CKPT_DIR} \ + --seq-length ${SEQ} \ + --num-layers ${NLAYERS} \ + --hidden-size ${HIDDEN} \ + --train-iters ${TRAIN_ITER} \ + --eval-iters ${EVAL_ITERS} \ + --distributed-backend ${BE} \ + --num-attention-heads ${HEADS} \ + --save-interval ${SAVE_INTERVAL} \ + --eval-interval ${EVAL_INTERVAL} \ + --max-position-embeddings ${SEQ} \ + --micro-batch-size ${MICRO_BATCH} \ + --data-file-list ${DATA_FILE_LIST} \ + --tensor-model-parallel-size ${TP} \ + --global-batch-size ${GLOBAL_BATCH} \ + --pipeline-model-parallel-size ${PP} \ + --num-key-value-heads ${NUM_KV_HEAD} \ + --data-cache-path ${DATA_CACHE_PATH} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + ${LLAMA_ARGS} \ + $ds_args \ + ${gpt_args[*]} \ + $custom_args \ + |& tee ${OUTPUT_LOG} + " + +run_cmd=$(echo "${run_cmd}" | sed -e 's/ */ /g') + +# echo "All DeepSpeed(s): $(which -a deepspeed)" +echo "! Using $(which deepspeed)" +ds_report + +echo "${run_cmd}" + +printf "[!! 
\e[1;31m%s\e[0m] View output at:\n" "NOTE" +printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" +# echo "${OUTPUT_LOG}" +eval "${run_cmd}" +set +x