Skip to content

Commit

Permalink
Merge pull request #159 from monarch-initiative/lint_quick_fix
Browse files Browse the repository at this point in the history
Lint/Format fixes
  • Loading branch information
ptgolden authored Nov 6, 2024
2 parents fceafe5 + 59bc851 commit 8a3bab9
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 26 deletions.
24 changes: 15 additions & 9 deletions src/koza/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,14 +127,16 @@ def _check_row_count(type: Literal["node", "edge"]):
_check_row_count("edge")


def split_file(file: str,
fields: str,
format: OutputFormat = OutputFormat.tsv,
remove_prefixes: bool = False,
output_dir: str = "./output"):
def split_file(
file: str,
fields: str,
format: OutputFormat = OutputFormat.tsv,
remove_prefixes: bool = False,
output_dir: str = "./output",
):
db = duckdb.connect(":memory:")

#todo: validate that each of the fields is actually a column in the file
# todo: validate that each of the fields is actually a column in the file
if format == OutputFormat.tsv:
read_file = f"read_csv('{file}')"
elif format == OutputFormat.json:
Expand Down Expand Up @@ -179,14 +181,18 @@ def get_filename_suffix(name):
for row in list_of_value_dicts:
# export to a tsv file named with the values of the pivot fields
where_clause = ' AND '.join([f"{k} = '{row[k]}'" for k in keys])
file_name = output_dir + "/" + get_filename_prefix(file) + generate_filename_from_row(row) + get_filename_suffix(file)
file_name = (
output_dir + "/" + get_filename_prefix(file) + generate_filename_from_row(row) + get_filename_suffix(file)
)
print(f"writing {file_name}")
db.execute(f"""
db.execute(
f"""
COPY (
SELECT * FROM {read_file}
WHERE {where_clause}
) TO '{file_name}' (HEADER, DELIMITER '\t');
""")
"""
)


def validate_file(
Expand Down
9 changes: 7 additions & 2 deletions src/koza/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,20 @@ def validate(
"""Validate a source file"""
validate_file(file, format, delimiter, header_delimiter, skip_blank_lines)


@typer_app.command()
def split(
file: str = typer.Argument(..., help="Path to the source kgx file to be split"),
fields: str = typer.Argument(..., help="Comma separated list of fields to split on"),
remove_prefixes: bool = typer.Option(False, help="Remove prefixes from the file names for values from the specified fields. (e.g, NCBIGene:9606 becomes 9606"),
remove_prefixes: bool = typer.Option(
False,
help="Remove prefixes from the file names for values from the specified fields. (e.g, NCBIGene:9606 becomes 9606",
),
output_dir: str = typer.Option(default="output", help="Path to output directory"),
):
"""Split a file by fields"""
split_file(file, fields,remove_prefixes=remove_prefixes, output_dir=output_dir)
split_file(file, fields, remove_prefixes=remove_prefixes, output_dir=output_dir)


if __name__ == "__main__":
typer_app()
8 changes: 4 additions & 4 deletions src/koza/model/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,13 @@ def __next__(self) -> Dict[str, Any]:
return row

def _get_row(self):
#If we built a filter for this source, run extra code to validate each row for inclusion in the final output.
# If we built a filter for this source, run extra code to validate each row for inclusion in the final output.
if self._filter:
row = next(self._reader)
reject_current_row = not self._filter.include_row(row)
#If the filter says we shouldn't include the current row; we filter it out and move onto the next row.
#We'll only break out of the following loop if "reject_current_row" is false (i.e. include_row is True/we
#have a valid row to return) or we hit a StopIteration exception from self._reader.
# If the filter says we shouldn't include the current row, we filter it out and move on to the next row.
# We'll only break out of the following loop if "reject_current_row" is false (i.e. include_row is True and
# we have a valid row to return) or we hit a StopIteration exception from self._reader.
while reject_current_row:
row = next(self._reader)
reject_current_row = not self._filter.include_row(row)
Expand Down
22 changes: 11 additions & 11 deletions src/koza/utils/row_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,22 +54,22 @@ def include_row(self, row) -> bool:
return include_row

def inlist(self, column_value, filter_values):
#Check if the passed in column is exactly matched against
#For a filter_list of ['abc','def','ghi']; this will be true
#for column_value 'abc' but not 'abcde.'
# Check whether the passed-in column value exactly matches one of the filter values.
# For a filter_list of ['abc','def','ghi'], this will be true
# for column_value 'abc' but not 'abcde'.
col_exact_match = column_value in filter_values
#The following iterates through all filters and will return true if
#the text of the filter is found within the column_value.
#So for the above example this boolean will return True, because :'abc' in 'abcde': returns True.
if(type(column_value)==str):
# The following iterates through all filters and will return true if
# the text of the filter is found within the column_value.
# So for the above example this boolean will return True, because :'abc' in 'abcde': returns True.
if type(column_value) == str:
col_inexact_match = any([filter_value in column_value for filter_value in filter_values])
else:
col_inexact_match = False
return col_exact_match or col_inexact_match

def inlist_exact(self, column_value, filter_values):
#Check if the passed in column is exactly matched against
#For a filter_list of ['abc','def','ghi']; this will be true
#for column_value 'abc' but not 'abcde.'
# Check whether the passed-in column value exactly matches one of the filter values.
# For a filter_list of ['abc','def','ghi'], this will be true
# for column_value 'abc' but not 'abcde'.
col_exact_match = column_value in filter_values
return col_exact_match

0 comments on commit 8a3bab9

Please sign in to comment.