forked from askovdal/fhfrmlf-preprocessing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
create-csv-from-dir.py
53 lines (39 loc) · 1.96 KB
/
create-csv-from-dir.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import pandas as pd
def filter_suffix(filenames: list[str], suffix: str, negate: bool):
    """
    Filters filenames on whether their stem (the name without its extension) ends with a suffix.

    Filenames whose stem ends with the suffix are returned with that trailing suffix removed and
    the extension kept, e.g. ``'scan_TUBE.jpg'`` becomes ``'scan.jpg'``. Only the trailing
    occurrence is removed; the same text appearing earlier in the name is left alone.

    :param filenames: Filenames to filter.
    :param suffix: Suffix to look for at the end of each stem.
    :param negate: If True, keeps the filenames whose stem does NOT end with the suffix instead
        of the ones that do. These are returned unchanged.
    :return: A new list with the selected filenames, in input order.
    """
    keep_without_suffix = bool(negate)
    filtered_files = []
    for filename in filenames:
        # splitext treats a name without a dot as all stem, so extensionless files are
        # still matched on their full name.
        stem, extension = os.path.splitext(filename)
        has_suffix = stem.endswith(suffix)
        if has_suffix == keep_without_suffix:
            continue
        if has_suffix:
            # Strip only the trailing suffix. A blanket str.replace would also delete
            # occurrences in the middle of the name and corrupt it.
            filtered_files.append(stem.removesuffix(suffix) + extension)
        else:
            filtered_files.append(filename)
    return filtered_files
def create_csv_from_dir(dir_name: str, csv_name: str, suffix='', length: int = None, negate=False, *,
                        main_csv: str = 'train.csv', path_prefix: str = 'CheXpert-v1.0/train/',
                        random_state=None):
    """
    Creates a CSV from a directory of files.

    Reads the main CSV, keeps the rows whose ``Path`` value (with ``path_prefix`` removed and
    every ``/`` replaced by ``-``) equals one of the filenames found in the directory, and
    writes those rows in random order to ``csv_name`` without the index column.

    :param dir_name: Name of directory with the files that go into the CSV.
    :param csv_name: Name to use for the created CSV.
    :param suffix: Optional suffix to filter the filenames on. Only files with this suffix will be used.
    :param length: The length of the CSV. Random files from the directory will be used. ``None`` or
        ``0``, or a value not smaller than the number of matching rows, keeps all matching rows.
    :param negate: If True, only uses files without the suffix.
    :param main_csv: Path of the CSV to select rows from. It must have a ``Path`` column.
    :param path_prefix: Prefix removed from each ``Path`` value before it is compared with the filenames.
    :param random_state: Optional seed passed to ``DataFrame.sample`` for reproducible output.
    """
    filenames = os.listdir(dir_name)
    if suffix:
        filenames = filter_suffix(filenames, suffix, negate)

    main_df = pd.read_csv(main_csv)

    # Flatten each path to the directory's naming scheme ('a/b/c.jpg' -> 'a-b-c.jpg')
    # so it can be compared with the filenames directly.
    flattened_paths = main_df['Path'].str.removeprefix(path_prefix).str.replace('/', '-', regex=False)
    dir_df = main_df.loc[flattened_paths.isin(filenames)]

    if length and length < len(dir_df.index):
        # sample() already returns the chosen rows in random order, so no extra shuffle is needed.
        dir_df = dir_df.sample(n=length, random_state=random_state)
    else:
        # Keep every matching row, just shuffle them.
        dir_df = dir_df.sample(frac=1, random_state=random_state)

    dir_df.to_csv(csv_name, index=False)
# Build two CSVs of up to 300 rows each from the same directory: first from the files
# whose names carry the '_TUBE' suffix, then from the files that do not.
for csv_path, without_suffix in (
    ('subsets/pneumothorax-positive-w-tube.csv', False),
    ('subsets/pneumothorax-positive-wo-tube.csv', True),
):
    create_csv_from_dir('subsets/pneumothorax-positive', csv_path, '_TUBE', 300, without_suffix)