add export capacity, implement csv, xlsx, json, and html

joheli · Feb 16, 2024 · a6a832e · a6a832e
1 parent 0aa8daf
commit a6a832e
Show file tree

Hide file tree

Showing 7 changed files with 145 additions and 24 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,4 @@ tipps_*
 .gitignore
 helper
 dumpster/
+*.bat
diff --git a/README.md b/README.md
@@ -1,8 +1,8 @@
 # rosinenpicker
 
-![Python Packaging](https://github.com/joheli/rosinenpicker/workflows/Packaging/badge.svg)
+![Python Packaging](https://github.com/joheli/rosinenpicker/workflows/Packaging/badge.svg) ![PyPI](https://img.shields.io/pypi/v/rosinenpicker?label=PyPI) ![PyPI - Downloads](https://img.shields.io/pypi/dm/rosinenpicker)
 
-'Rosinenpicker' is German for 'cherry picker'. Which explains what this program does: it goes through a list of documents to extract *just those juicy bits* **you** are interested in. It uses regular expressions to accomplish this. But please do read on to learn how to use the program. 
+'Rosinenpicker' is German for 'cherry picker' (never mind that 'Rosine' actually means *raisin*). Be that as it may - cherry picking is what `rosinenpicker` has been designed to do. It goes through a list of documents to extract *just those juicy bits* **you** are interested in. It uses regular expressions to accomplish this. But please do read on to learn how to use the program.
 
 # Installation
 
@@ -16,4 +16,16 @@ This should add the executable `rosinenpicker` to `PATH`, making it accessible f
 
 # Usage
 
-coming soon
+Please type
+
+```
+rosinenpicker -c config_file -d database_file
+```
+
+where `config_file` (default: `config.yml`) and `database_file` (default: `matches.db`) represent a yml-formatted configuration file (please see sample [config.yml](configs/config.yml), which is more or less self-explanatory) and a sqlite database file (automatically created if not present), respectively.
+
+For help type
+
+```
+rosinenpicker -h
+```
diff --git a/configs/config.yml b/configs/config.yml
@@ -0,0 +1,36 @@
+# Configuration file
+
+# Title: contains information that helps you identify the processing run
+title: 'A beautiful day'
+
+# Strategies: this section contains parameters that guide the processing of the files
+strategies:
+ # List your strategies here; name strategies as you please
+ my_strategy_01: 
+ # processed_directory: the folder that is searched for the files to be processed
+ processed_directory: '/path/to/your_directory'
+ # file_name_pattern: a regex pattern selecting the files to be processed
+ file_name_pattern: 'regex_pattern_identifying_your_file'
+ # optional: file_content_pattern - a regex pattern that has to return a match in the file contents
+ # this can be used to filter the files to be processed in addition to the file name
+ #file_content_pattern: '.*'
+ # file_format
+ # currently only 'pdf' and 'txt' are supported
+ file_format: 'pdf'
+ # terms
+ terms:
+ # Choose names for the terms and associate each with a regex pattern or, alternatively,
+ # two regex patterns surrounding '@@@', which serves as a divider.
+ # In the former case (i.e. only one regex pattern, no divider) matches to the regex are returned.
+ # In the latter case (i.e. two regex patterns, divider present) the two regex patterns are converted
+ # to groups surrounding a central "match-all" (.*) pattern. Only matches to the central group are returned.
+ my_first_term: 'regex1'
+ my_second_term: 'regex2@@@regex3'
+ # export format
+ # currently, the following formats are supported: csv, xlsx, html, json
+ export_format: 'xlsx'
+ # export path
+ export_path: '/path/to/your/file.xlsx'
+ # optional, for csv, set export_csv_divider (defaults to ;)
+ #export_csv_divider: ';'
+
diff --git a/pyproject.toml b/pyproject.toml
@@ -6,8 +6,10 @@ build-backend = "hatchling.build"
 exclude = [
  "pdfs/",
  "configs/",
- ".github",
- "helper",
+ ".github/",
+ "helper/",
+ "dumpster/",
+ "*.bat",
 ]
 
 [project]
@@ -27,6 +29,7 @@ classifiers = [
  "Environment :: Console",
 ]
 dependencies = [
+ "pandas>=2.2.0",
  "pydantic>=2.6.1",
  "pyyaml>=6.0.1",
  "pymupdf>=1.23.22",

diff --git a/src/rosinenpicker/exporter.py b/src/rosinenpicker/exporter.py
@@ -0,0 +1,41 @@
+import pandas as pd
+
class BaseExporter:
    """Common base for exporters that write a pandas DataFrame to a file.

    Subclasses override :meth:`export` for one concrete output format.
    """

    # DataFrame that will be written out; assigned in __init__.
    dataframe: pd.DataFrame
    # Path given to the most recent export() call of a subclass; None until then.
    export_path: "str | None" = None

    def __init__(self, dataframe: pd.DataFrame) -> None:
        """Store the DataFrame to be exported."""
        self.dataframe = dataframe

    def export(self, export_path: str) -> None:
        """Write the stored DataFrame to ``export_path``.

        Raises:
            NotImplementedError: Always; concrete exporters must override this.
        """
        raise NotImplementedError("This method should be implemented by subclasses.")


class CSVExporter(BaseExporter):
    """Exporter that writes the DataFrame as delimiter-separated text."""

    # Field delimiter handed to DataFrame.to_csv; instances may override it
    # through the ``separator`` property.
    _separator: str = ";"

    @property
    def separator(self) -> str:
        """Field delimiter used when writing the CSV file."""
        return self._separator

    @separator.setter
    def separator(self, newsep: str) -> None:
        # The original signature read ``newsep = str``, which made the ``str``
        # type a default value rather than an annotation.
        if newsep is None:
            # None means "not configured": fall back to the class default
            # instead of forwarding an invalid ``sep`` to pandas.
            newsep = type(self)._separator
        self._separator = newsep

    def export(self, export_path: str) -> None:
        """Write the DataFrame to ``export_path`` using the current separator."""
        self.export_path = export_path
        self.dataframe.to_csv(export_path, sep=self.separator)


class XLSXExporter(BaseExporter):
    """Exporter that writes the DataFrame via ``DataFrame.to_excel``."""

    def export(self, export_path: str) -> None:
        """Write the DataFrame to ``export_path`` as a spreadsheet."""
        self.export_path = export_path
        self.dataframe.to_excel(export_path)


class HTMLExporter(BaseExporter):
    """Exporter that writes the DataFrame via ``DataFrame.to_html``."""

    def export(self, export_path: str) -> None:
        """Write the DataFrame to ``export_path`` as an HTML table."""
        self.export_path = export_path
        self.dataframe.to_html(export_path)


class JSONExporter(BaseExporter):
    """Exporter that writes the DataFrame via ``DataFrame.to_json``."""

    def export(self, export_path: str) -> None:
        """Write the DataFrame to ``export_path`` as JSON.

        ``orient='table'`` is passed to pandas, so the output carries a
        ``schema`` section next to the ``data`` records.
        """
        self.export_path = export_path
        self.dataframe.to_json(export_path, orient='table')
diff --git a/src/rosinenpicker/pydantic_models.py b/src/rosinenpicker/pydantic_models.py
@@ -16,7 +16,7 @@ class ConfigStrategy(BaseModel):
  terms: dict[str, str]
  export_format: str
  export_path: NewPath
- export_csv_divider: str = ';'
+ export_csv_divider: Optional[str] = ';'
  # terms_patterns_group is created from 'terms', see @model_validator
  terms_patterns_group: dict[str, tuple[re.Pattern, int, int]] = None
  matchall_maxlength: int = 100
@@ -52,7 +52,7 @@ def check_terms_and_patterns(self):
  @field_validator('export_format')
  @classmethod
  def validate_export_format(cls, ef: str):
      """Accept ``ef`` only if it names one of the supported export formats.

      Returns ``ef`` unchanged when it is supported; raises ``ConfigError``
      otherwise.
      """
      supported = {"csv", "html", "json", "xlsx"}
      # Guard style: hand back the accepted value first, fail afterwards.
      if ef in supported:
          return ef
      raise ConfigError(msg=f"Concerning '{ef}': Export format must conform to one of these options: {supported}!")

diff --git a/src/rosinenpicker/start.py b/src/rosinenpicker/start.py
@@ -1,12 +1,14 @@
-__version__ = '0.0.4'
+__version__ = '0.0.5'
 import yaml
 import re
 import os
+import pandas as pd
 from .pydantic_models import Config, ConfigStrategy, ConfigError
 from .database import Base, DbRun, DbStrategy, DbProcessedFile, DbMatch
 from .utils import file_sha256
+from .exporter import BaseExporter, CSVExporter, XLSXExporter, HTMLExporter, JSONExporter
 from .processors import DocumentProcessor, PDFProcessor, TXTProcessor
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, select
 from sqlalchemy.orm import sessionmaker, Session
 import argparse
 
@@ -23,8 +25,8 @@ def find_documents(directory, file_name_pattern) -> list[str]:
  pattern = re.compile(file_name_pattern)
  return [os.path.join(directory, f) for f in os.listdir(directory) if pattern.match(f)]
 
-def process_strategy(strategy_name: str, cs: ConfigStrategy, session: Session, run_id: int, 
- processor: DocumentProcessor):
+def process_strategy(strategy_name: str, cs: ConfigStrategy, db: Session, run_id: int, 
+ processor: DocumentProcessor, exporter: BaseExporter):
  # save strategy info
  d_strategy = DbStrategy(run_id = run_id,
  name = strategy_name,
@@ -34,14 +36,15 @@ def process_strategy(strategy_name: str, cs: ConfigStrategy, session: Session, r
  export_format = cs.export_format,
  export_path = str(cs.export_path),
  export_csv_divider = cs.export_csv_divider)
- session.add(d_strategy)
- session.commit()
+ db.add(d_strategy)
+ db.commit()
 
- # loop thru files
+ # locate files (documents)
  documents = find_documents(cs.processed_directory, cs.file_name_pattern)
  if len(documents) < 1:
  raise ConfigError(f"No matching documents found with pattern {cs.file_name_pattern} in directory {cs.processed_directory}!")
 
+ # loop thru documents
  for doc in documents:
  pr = processor(doc, cs.matchall_maxlength)
  # if file_content_pattern is given and if that pattern is not found in the document, skip the document
@@ -53,43 +56,68 @@ def process_strategy(strategy_name: str, cs: ConfigStrategy, session: Session, r
  d_file = DbProcessedFile(strategy_id = d_strategy.id,
  filename = os.path.basename(doc),
  sha256 = file_sha256(doc))
- session.add(d_file)
- session.commit()
+ db.add(d_file)
+ db.commit()
 
  # get content from file
  terms_content = pr.terms_content(cs.terms_patterns_group)
 
  # save content into database
  for term, content in terms_content.items():
  dm = DbMatch(file_id = d_file.id, term = term, content = content)
- session.add(dm)
- session.commit()
+ db.add(dm)
+ db.commit()
+
+ # export
+ # formulate sql query filtering for the current strategy
+ sql = select(DbStrategy.id, DbProcessedFile.id, DbProcessedFile.filename, DbMatch.term, DbMatch.content)\
+ .join(DbStrategy.processed_files).join(DbProcessedFile.matches)\
+ .filter(DbStrategy.id == d_strategy.id)
+
+ # translate into pandas dataframe
+ df = pd.read_sql_query(sql=sql, con=db.connection())
+
+ # pivot to wide
+ df_wide = df.pivot(index=['filename'], columns='term', values='content')
+
+ # pass wide dataframe to exporter
+ exp = exporter(df_wide)
+
+ # set separator for csv only
+ if cs.export_format == "csv":
+ exporter.separator = cs.export_csv_divider
+
+ # write to file
+ exp.export(cs.export_path)
 
 def main(config_file: str, db_file: str):
     """Process every strategy of a configuration file and record the run.

     Args:
         config_file: Path of the configuration file handed to
             ``read_config_file``; its name and SHA-256 are stored with the run.
         db_file: Path of the SQLite file used to build the ``sqlite:///`` URL.
     """
     config = read_config_file(config_file)
     engine = create_engine(f'sqlite:///{db_file}')
     Base.metadata.create_all(engine)

     # Named ``session_factory`` so the imported ``Session`` class is not shadowed.
     session_factory = sessionmaker(bind=engine)
     db = session_factory()

     # processor and exporter options according to file_format and export_format
     file_format_options = {"pdf": PDFProcessor, "txt": TXTProcessor}
     export_format_options = {"csv": CSVExporter, "xlsx": XLSXExporter, "html": HTMLExporter, "json": JSONExporter}

     try:
         # save run info
         run = DbRun(title = config.title,
                     yml_filename = config_file,
                     yml_sha256 = file_sha256(config_file))
         db.add(run)
         db.commit()

         for strategy_name, strategy in config.strategies.items():
             # processor is chosen according to strategy.file_format
             processor = file_format_options[strategy.file_format]
             # exporter is chosen according to strategy.export_format
             exporter = export_format_options[strategy.export_format]
             # process using correct processor and exporter
             process_strategy(strategy_name, strategy, db, run.id, processor, exporter)
     finally:
         # Close the session even when a strategy raises (e.g. ConfigError),
         # so the database connection is not left open.
         db.close()
 
 def cli():
  parser = argparse.ArgumentParser(description="A package for picking the juciest text morsels out of a pile of documents.")