Skip to content

Commit

Permalink
Remove lto
Browse files Browse the repository at this point in the history
  • Loading branch information
orf committed Aug 23, 2023
1 parent cbc977b commit c2093e7
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 6 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/unique_python_files.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ jobs:
toolchain: stable
override: true

- name: Install parquet-cli
run: cargo install parquet --features=cli

- uses: Swatinem/rust-cache@v2
with:
prefix-key: unique-python-files-
Expand All @@ -37,3 +40,7 @@ jobs:
- name: Run
run: |
./target/optimized/data links/dataset.txt data/
- name: Combine
run: |
parquet-concat data/combined.parquet data/output/**/*.parquet
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ snmalloc-rs = "0.3"

[profile.optimized]
inherits = "release"
lto = true
codegen-units = 1
#lto = true
#codegen-units = 1
1 change: 1 addition & 0 deletions sql/unique_files.sql
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ FROM (SELECT hash,
FROM input_dataset
where skip_reason = ''
and archive_path ILIKE '%.py' and size != 0) as ordered;
WHERE n = 1
8 changes: 8 additions & 0 deletions sql/unique_files_combined.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
-- Deduplicate files by content hash: for every distinct hash (compared via its
-- hex encoding), keep only the earliest-uploaded row. ROW_NUMBER() numbers the
-- rows within each hash partition by uploaded_on ascending, so n = 1 selects
-- the oldest occurrence of each unique file.
--
-- BUG FIX: the original terminated the statement with ';' on the ') as ordered'
-- line, leaving 'WHERE n = 1' dangling AFTER the statement end — the dedup
-- filter was never part of the query. The WHERE clause now precedes the
-- terminating semicolon.
SELECT hash, repository, uploaded_on
FROM (SELECT hash,
             repository,
             uploaded_on,
             ROW_NUMBER() OVER (PARTITION BY encode(hash, 'hex') order by uploaded_on) as n
      FROM input_dataset
     ) as ordered
WHERE n = 1;
9 changes: 5 additions & 4 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,10 @@ async fn main() -> Result<()> {
let path = download_dir.join(format!("url-{}.parquet", idx));
let output_dir = output_dir.join(format!("url-{}/", idx));
download_file(&url, &path).await?;
get_unique_python_files(&path, &output_dir).await?;
get_unique_python_files(&path, &output_dir, include_str!("../sql/unique_files.sql")).await?;
tokio::fs::remove_file(&path).await?;
}
get_unique_python_files(Path::new("data/combined.parquet"), Path::new("data/combined/"), include_str!("../sql/unique_files_combined.sql")).await?;
Ok(())
}

Expand Down Expand Up @@ -82,12 +83,12 @@ async fn download_file(url: &str, path: &Path) -> Result<()> {
Ok(())
}

async fn get_unique_python_files(path: &Path, output: &Path) -> Result<()> {
async fn get_unique_python_files(path: &Path, output: &Path, sql: &str) -> Result<()> {
let ctx = SessionContext::new();
let read_options = ParquetReadOptions::default().parquet_pruning(true);
ctx.register_parquet("input_dataset", path.to_str().unwrap(), read_options).await?;

let df = ctx.sql(include_str!("../sql/unique_files.sql")).await.unwrap();
let df = ctx.sql(sql).await.unwrap();

let props = WriterProperties::builder()
.set_compression(Compression::ZSTD(ZstdLevel::try_new(13).unwrap()))
Expand All @@ -99,4 +100,4 @@ async fn get_unique_python_files(path: &Path, output: &Path) -> Result<()> {
Some(props),
).await?;
Ok(())
}
}

0 comments on commit c2093e7

Please sign in to comment.