Skip to content

Commit

Permalink
Try this
Browse files Browse the repository at this point in the history
  • Loading branch information
orf committed Aug 23, 2023
1 parent f56b749 commit ef95f52
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 6 deletions.
7 changes: 2 additions & 5 deletions .github/workflows/unique_python_files.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@ jobs:
toolchain: stable
override: true

- - name: Install parquet-cli
- run: cargo install parquet --features=cli
-
- uses: Swatinem/rust-cache@v2
with:
prefix-key: unique-python-files-
Expand All @@ -41,6 +38,6 @@ jobs:
run: |
./target/optimized/data links/dataset.txt data/
- - name: Combine
+ - name: Check
  run: |
- parquet-concat data/combined.parquet data/output/**/*.parquet
+ ls -la data/
14 changes: 13 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ async fn main() -> Result<()> {

let download_dir = args.working_directory.join("downloads");
let output_dir = args.working_directory.join("output");
- let combined_parquet_file = args.working_directory.join("combined.parquet");
  let final_output_dir = args.working_directory.join("final");
+ let combined_parquet_file = args.working_directory.join("combined.parquet");
tokio::fs::create_dir_all(&args.working_directory).await?;
tokio::fs::create_dir_all(&download_dir).await?;
tokio::fs::create_dir_all(&output_dir).await?;
Expand Down Expand Up @@ -82,6 +82,18 @@ async fn main() -> Result<()> {
)
.await?;

+ tokio::fs::remove_file(&combined_parquet_file).await?;
+
+ let all_files: Vec<_> = glob::glob(&format!("{}/**/*.parquet", final_output_dir.display()))
+ .unwrap()
+ .flatten()
+ .collect();
+
+ tokio::task::spawn_blocking(move || {
+ combine_parquet_files(&all_files, &combined_parquet_file)
+ })
+ .await??;
+
Ok(())
}

Expand Down

0 comments on commit ef95f52

Please sign in to comment.