diff --git a/.gitignore b/.gitignore
index 102d9fc..67f6d4a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ tipps_*
 .gitignore
 helper
 dumpster/
+*.bat
diff --git a/README.md b/README.md
index 4c9be61..fcd5c8f 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 # rosinenpicker
 
-![Python Packaging](https://github.com/joheli/rosinenpicker/workflows/Packaging/badge.svg)
+![Python Packaging](https://github.com/joheli/rosinenpicker/workflows/Packaging/badge.svg) ![PyPI](https://img.shields.io/pypi/v/rosinenpicker?label=PyPI) ![PyPI - Downloads](https://img.shields.io/pypi/dm/rosinenpicker)
 
-'Rosinenpicker' is German for 'cherry picker'. Which explains what this program does: it goes through a list of documents to extract *just those juicy bits* **you** are interested in. It uses regular expressions to accomplish this. But please do read on to learn how to use the program.
+'Rosinenpicker' is German for 'cherry picker' (never mind that 'Rosine' actually means *raisin*). Be that as it may - cherry picking is what `rosinenpicker` has been designed to do: it goes through a list of documents and extracts *just those juicy bits* **you** are interested in, using regular expressions. Please read on to learn how to use the program.
 
 # Installation
 
@@ -16,4 +16,16 @@ This should add the executable `rosinenpicker` to `PATH`, making it accessible f
 
 # Usage
 
-coming soon
\ No newline at end of file
+Please type
+
+```
+rosinenpicker -c config_file -d database_file
+```
+
+where `config_file` (default: `config.yml`) and `database_file` (default: `matches.db`) represent a YAML-formatted configuration file (please see the sample [config.yml](configs/config.yml), which is more or less self-explanatory) and an SQLite database file (created automatically if not present), respectively.
+
+For help, type
+
+```
+rosinenpicker -h
+```
diff --git a/configs/config.yml b/configs/config.yml
new file mode 100644
index 0000000..b420a94
--- /dev/null
+++ b/configs/config.yml
@@ -0,0 +1,36 @@
+# Configuration file
+
+# Title: contains information that helps you identify the processing run
+title: 'A beautiful day'
+
+# Strategies: this section contains parameters that guide the processing of the files
+strategies:
+  # List your strategies here; name strategies as you please
+  my_strategy_01:
+    # processed_directory: the folder in which the files to be processed are looked for and hopefully found
+    processed_directory: '/path/to/your_directory'
+    # file_name_pattern: a regex pattern selecting the files to be processed
+    file_name_pattern: 'regex_pattern_identifying_your_file'
+    # optional: file_content_pattern - a regex pattern that has to return a match in the file contents;
+    # this can be used to filter the files to be processed in addition to the file name pattern
+    #file_content_pattern: '.*'
+    # file_format
+    # currently only 'pdf' and 'txt' are supported
+    file_format: 'pdf'
+    # terms
+    terms:
+      # Choose names for the terms and associate each with a regex pattern or, alternatively,
+      # two regex patterns surrounding '@@@', which serves as a divider.
+      # In the former case (i.e. only one regex pattern, no divider) matches to the regex are returned.
+      # In the latter case (i.e. two regex patterns, divider present) the two regex patterns are converted
+      # to groups surrounding a central "match-all" (.*) pattern. Only matches to the central group are returned.
+      my_first_term: 'regex1'
+      my_second_term: 'regex2@@@regex3'
+    # export format
+    # currently, the following formats are supported: csv, xlsx, html, json
+    export_format: 'xlsx'
+    # export path
+    export_path: '/path/to/your/file.xlsx'
+    # optional, for csv, set export_csv_divider (defaults to ';')
+    #export_csv_divider: ';'
+
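The two-pattern syntax documented in the config comments above can be pictured with a minimal, self-contained sketch. This is an illustration only, not code from this diff: the helper name `combine_term_pattern` is made up, and the real handling (including `matchall_maxlength`) lives in `pydantic_models.py`/`processors.py`.

```python
import re

def combine_term_pattern(term: str) -> re.Pattern:
    """Hypothetical helper: 'left@@@right' becomes '(left)(.*)(right)';
    a term without the '@@@' divider is compiled as-is."""
    if '@@@' in term:
        left, right = term.split('@@@', maxsplit=1)
        return re.compile(f"({left})(.*)({right})")
    return re.compile(term)

# Only the central '(.*)' group is the bit that gets reported:
pattern = combine_term_pattern(r"Invoice no\.@@@EUR")
match = pattern.search("Invoice no. 2024-017 EUR")
if match:
    print(match.group(2).strip())  # -> '2024-017'
```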
diff --git a/pyproject.toml b/pyproject.toml
index 3038161..fbd5d8a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,8 +6,10 @@ build-backend = "hatchling.build"
 exclude = [
   "pdfs/",
   "configs/",
-  ".github",
-  "helper",
+  ".github/",
+  "helper/",
+  "dumpster/",
+  "*.bat",
 ]
 
 [project]
@@ -27,6 +29,7 @@ classifiers = [
   "Environment :: Console",
 ]
 dependencies = [
+  "pandas>=2.2.0",
   "pydantic>=2.6.1",
   "pyyaml>=6.0.1",
   "pymupdf>=1.23.22",
diff --git a/src/rosinenpicker/exporter.py b/src/rosinenpicker/exporter.py
new file mode 100644
index 0000000..9cc64d7
--- /dev/null
+++ b/src/rosinenpicker/exporter.py
@@ -0,0 +1,41 @@
+import pandas as pd
+
+class BaseExporter:
+    dataframe: pd.DataFrame
+    export_path: str = None
+
+    def __init__(self, dataframe: pd.DataFrame) -> None:
+        self.dataframe = dataframe
+
+    def export(self, export_path: str) -> None:
+        raise NotImplementedError("This method should be implemented by subclasses.")
+
+class CSVExporter(BaseExporter):
+    _separator: str = ";"
+
+    @property
+    def separator(self) -> str:
+        return self._separator
+
+    @separator.setter
+    def separator(self, newsep: str) -> None:
+        self._separator = newsep
+
+    def export(self, export_path: str) -> None:
+        self.export_path = export_path
+        self.dataframe.to_csv(export_path, sep = self.separator)
+
+class XLSXExporter(BaseExporter):
+    def export(self, export_path: str) -> None:
+        self.export_path = export_path
+        self.dataframe.to_excel(export_path)
+
+class HTMLExporter(BaseExporter):
+    def export(self, export_path: str) -> None:
+        self.export_path = export_path
+        self.dataframe.to_html(export_path)
+
+class JSONExporter(BaseExporter):
+    def export(self, export_path: str) -> None:
+        self.export_path = export_path
+        self.dataframe.to_json(export_path, orient = 'table')
diff --git a/src/rosinenpicker/pydantic_models.py b/src/rosinenpicker/pydantic_models.py
index ea41d7c..e41e08b 100644
--- a/src/rosinenpicker/pydantic_models.py
+++ b/src/rosinenpicker/pydantic_models.py
@@ -16,7 +16,7 @@ class ConfigStrategy(BaseModel):
     terms: dict[str, str]
     export_format: str
     export_path: NewPath
-    export_csv_divider: str = ';'
+    export_csv_divider: Optional[str] = ';'
     # terms_patterns_group is created from 'terms', see @model_validator
     terms_patterns_group: dict[str, tuple[re.Pattern, int, int]] = None
     matchall_maxlength: int = 100
@@ -52,7 +52,7 @@ def check_terms_and_patterns(self):
     @field_validator('export_format')
     @classmethod
     def validate_export_format(cls, ef: str):
-        valid_formats = {"csv"}
+        valid_formats = {"csv", "html", "json", "xlsx"}
        if ef not in valid_formats:
            raise ConfigError(msg=f"Concerning '{ef}': Export format must conform to one of these options: {valid_formats}!")
        return ef
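A quick, hypothetical usage sketch for the new exporter classes above; the data and paths are made up and not part of this diff.

```python
import pandas as pd
from rosinenpicker.exporter import CSVExporter, XLSXExporter

# a made-up wide table: one row per processed file, one column per term
df = pd.DataFrame(
    {"my_first_term": ["foo"], "my_second_term": ["bar"]},
    index=pd.Index(["report_2024.pdf"], name="filename"),
)

csv_exp = CSVExporter(df)
csv_exp.separator = ","            # the property defaults to ';'
csv_exp.export("/tmp/matches.csv")

# XLSX export relies on pandas' Excel writer (e.g. openpyxl being installed)
XLSXExporter(df).export("/tmp/matches.xlsx")
```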
diff --git a/src/rosinenpicker/start.py b/src/rosinenpicker/start.py
index eefd08d..4512b43 100644
--- a/src/rosinenpicker/start.py
+++ b/src/rosinenpicker/start.py
@@ -1,12 +1,14 @@
-__version__ = '0.0.4'
+__version__ = '0.0.5'
 
 import yaml
 import re
 import os
+import pandas as pd
 from .pydantic_models import Config, ConfigStrategy, ConfigError
 from .database import Base, DbRun, DbStrategy, DbProcessedFile, DbMatch
 from .utils import file_sha256
+from .exporter import BaseExporter, CSVExporter, XLSXExporter, HTMLExporter, JSONExporter
 from .processors import DocumentProcessor, PDFProcessor, TXTProcessor
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, select
 from sqlalchemy.orm import sessionmaker, Session
 import argparse
@@ -23,8 +25,8 @@ def find_documents(directory, file_name_pattern) -> list[str]:
     pattern = re.compile(file_name_pattern)
     return [os.path.join(directory, f) for f in os.listdir(directory) if pattern.match(f)]
 
-def process_strategy(strategy_name: str, cs: ConfigStrategy, session: Session, run_id: int,
-                     processor: DocumentProcessor):
+def process_strategy(strategy_name: str, cs: ConfigStrategy, db: Session, run_id: int,
+                     processor: DocumentProcessor, exporter: BaseExporter):
     # save strategy info
     d_strategy = DbStrategy(run_id = run_id,
                             name = strategy_name,
@@ -34,14 +36,15 @@ def process_strategy(strategy_name: str, cs: ConfigStrategy, session: Session, r
                             export_format = cs.export_format,
                             export_path = str(cs.export_path),
                             export_csv_divider = cs.export_csv_divider)
-    session.add(d_strategy)
-    session.commit()
+    db.add(d_strategy)
+    db.commit()
 
-    # loop thru files
+    # locate files (documents)
     documents = find_documents(cs.processed_directory, cs.file_name_pattern)
     if len(documents) < 1:
         raise ConfigError(f"No matching documents found with pattern {cs.file_name_pattern} in directory {cs.processed_directory}!")
 
+    # loop thru documents
     for doc in documents:
         pr = processor(doc, cs.matchall_maxlength)
         # if file_content_pattern is given and if that pattern is not found in the document, skip the document
@@ -53,8 +56,8 @@ def process_strategy(strategy_name: str, cs: ConfigStrategy, session: Session, r
         d_file = DbProcessedFile(strategy_id = d_strategy.id,
                                  filename = os.path.basename(doc),
                                  sha256 = file_sha256(doc))
-        session.add(d_file)
-        session.commit()
+        db.add(d_file)
+        db.commit()
 
         # get content from file
         terms_content = pr.terms_content(cs.terms_patterns_group)
@@ -62,8 +65,30 @@ def process_strategy(strategy_name: str, cs: ConfigStrategy, session: Session, r
         # save content into database
         for term, content in terms_content.items():
             dm = DbMatch(file_id = d_file.id, term = term, content = content)
-            session.add(dm)
-            session.commit()
+            db.add(dm)
+            db.commit()
+
+    # export
+    # formulate sql query filtering for the current strategy
+    sql = select(DbStrategy.id, DbProcessedFile.id, DbProcessedFile.filename, DbMatch.term, DbMatch.content)\
+        .join(DbStrategy.processed_files).join(DbProcessedFile.matches)\
+        .filter(DbStrategy.id == d_strategy.id)
+
+    # translate into pandas dataframe
+    df = pd.read_sql_query(sql=sql, con=db.connection())
+
+    # pivot to wide
+    df_wide = df.pivot(index=['filename'], columns='term', values='content')
+
+    # pass wide dataframe to exporter
+    exp = exporter(df_wide)
+
+    # set separator for csv only
+    if cs.export_format == "csv":
+        exp.separator = cs.export_csv_divider
+
+    # write to file
+    exp.export(cs.export_path)
 
 def main(config_file: str, db_file: str):
     config = read_config_file(config_file)
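The pivot in the export block above is easier to follow on a toy DataFrame; this sketch uses invented data and skips the database entirely.

```python
import pandas as pd

# long format, roughly what the SELECT over processed files and matches yields
df = pd.DataFrame({
    "filename": ["a.pdf", "a.pdf", "b.pdf", "b.pdf"],
    "term":     ["my_first_term", "my_second_term", "my_first_term", "my_second_term"],
    "content":  ["foo", "bar", "baz", "qux"],
})

# wide format: one row per file, one column per term
df_wide = df.pivot(index=["filename"], columns="term", values="content")
print(df_wide)
# term     my_first_term my_second_term
# filename
# a.pdf              foo            bar
# b.pdf              baz            qux
```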
@@ -71,25 +96,28 @@
     Base.metadata.create_all(engine)
     Session = sessionmaker(bind=engine)
-    session = Session()
+    db = Session()
 
-    # processor options according to file_format
+    # processor and exporter options according to file_format and export_format
     file_format_options = {"pdf": PDFProcessor, "txt": TXTProcessor}
+    export_format_options = {"csv": CSVExporter, "xlsx": XLSXExporter, "html": HTMLExporter, "json": JSONExporter}
 
     # save run info
     run = DbRun(title = config.title,
                 yml_filename = config_file,
                 yml_sha256 = file_sha256(config_file))
-    session.add(run)
-    session.commit()
+    db.add(run)
+    db.commit()
 
     for strategy_name, strategy in config.strategies.items():
         # processor is chosen according to strategy.file_format
         processor = file_format_options[strategy.file_format]
+        # exporter is chosen according to strategy.export_format
+        exporter = export_format_options[strategy.export_format]
         # process using correct processor
-        process_strategy(strategy_name, strategy, session, run.id, processor)
+        process_strategy(strategy_name, strategy, db, run.id, processor, exporter)
 
-    session.close()
+    db.close()
 
 def cli():
     parser = argparse.ArgumentParser(description="A package for picking the juciest text morsels out of a pile of documents.")
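Besides the `cli()` entry point, a run can also be started programmatically via `main()`; a short sketch assuming the sample config shipped in this diff and the default database name.

```python
# equivalent to: rosinenpicker -c configs/config.yml -d matches.db
from rosinenpicker.start import main

main(config_file="configs/config.yml", db_file="matches.db")
```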