Skip to content

Commit

Permalink
Try uploading assets
Browse files Browse the repository at this point in the history
  • Loading branch information
orf committed Aug 23, 2023
1 parent 2806fc3 commit e71642b
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 11 deletions.
15 changes: 14 additions & 1 deletion .github/workflows/unique_python_files.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,21 @@ jobs:

- name: Run
run: |
./target/optimized/data links/dataset.txt data/ --limit=5
./target/optimized/data links/dataset.txt data/
- name: Check
run: |
ls -la data/
- name: Fetch Latest Release
id: get_release
uses: gregziegan/fetch-latest-release@v2.0.0
with:
github_token: ${{ github.token }}

- name: Upload Assets
uses: shogo82148/actions-upload-release-asset@v1
with:
upload_url: ${{ steps.get_release.outputs.upload_url }}
asset_path: ${{ github.workspace }}/data/combined.parquet
asset_name: unique_python_files.parquet
5 changes: 5 additions & 0 deletions sql/only_python_files.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- Select the metadata columns for every Python file in `input_dataset`
-- (the table name the caller registers its parquet input under).
-- A row is kept only when it was not skipped (`skip_reason` is empty),
-- its path ends in `.py` (case-insensitive match), and it is not zero-sized.
-- Results are ordered by project, then version, then path within the archive.
SELECT
    archive_path,
    hash,
    lines,
    project_name,
    project_version,
    size,
    uploaded_on
FROM input_dataset
WHERE skip_reason = ''
  AND archive_path ILIKE '%.py'
  AND size != 0
ORDER BY
    project_name,
    project_version,
    archive_path;
25 changes: 15 additions & 10 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,13 @@ async fn main() -> Result<()> {

let download_dir = args.working_directory.join("downloads");
let output_dir = args.working_directory.join("output");
let only_python_dir = args.working_directory.join("only_python");
let final_output_dir = args.working_directory.join("final");
let combined_parquet_file = args.working_directory.join("combined.parquet");
tokio::fs::create_dir_all(&args.working_directory).await?;
tokio::fs::create_dir_all(&download_dir).await?;
tokio::fs::create_dir_all(&output_dir).await?;
// tokio::fs::create_dir_all(&final_output_dir).await?;
tokio::fs::create_dir_all(&only_python_dir).await?;

let urls_file = BufReader::new(File::open(&args.urls_file).await?);
let mut lines = urls_file.lines();
Expand All @@ -59,9 +60,11 @@ async fn main() -> Result<()> {
for (idx, url) in urls.into_iter().enumerate() {
let path = download_dir.join(format!("url-{}.parquet", idx));
let output_dir = output_dir.join(format!("url-{}/", idx));
let only_python_dir = only_python_dir.join(format!("url-{}/", idx));
download_file(&url, &path).await?;
get_unique_python_files(&path, &output_dir, include_str!("../sql/unique_files.sql"))
.await?;
run_sql(&path, &output_dir, include_str!("../sql/unique_files.sql")).await?;
// run_sql(&path, &only_python_dir, include_str!("../sql/only_python_files.sql"))
// .await?;
tokio::fs::remove_file(&path).await?;
}

Expand All @@ -79,7 +82,7 @@ async fn main() -> Result<()> {

println!("Reducing combined files to unique records");

get_unique_python_files(
run_sql(
&combined_parquet_file,
&final_output_dir,
include_str!("../sql/unique_files_combined.sql"),
Expand All @@ -93,12 +96,14 @@ async fn main() -> Result<()> {
.flatten()
.collect();

println!("Finally reducing {} files to {}", all_files.len(), combined_parquet_file.display());
println!(
"Finally reducing {} files to {}",
all_files.len(),
combined_parquet_file.display()
);

tokio::task::spawn_blocking(move || {
combine_parquet_files(&all_files, &combined_parquet_file)
})
.await??;
tokio::task::spawn_blocking(move || combine_parquet_files(&all_files, &combined_parquet_file))
.await??;

Ok(())
}
Expand Down Expand Up @@ -133,7 +138,7 @@ async fn download_file(url: &str, path: &Path) -> Result<()> {
Ok(())
}

async fn get_unique_python_files(path: &Path, output: &Path, sql: &str) -> Result<()> {
async fn run_sql(path: &Path, output: &Path, sql: &str) -> Result<()> {
let ctx = SessionContext::new();
let read_options = ParquetReadOptions::default().parquet_pruning(true);
ctx.register_parquet("input_dataset", path.to_str().unwrap(), read_options)
Expand Down

0 comments on commit e71642b

Please sign in to comment.