Skip to content

Commit

Permalink
add export capacity, implement csv, xlsx, json, and html
Browse files Browse the repository at this point in the history
  • Loading branch information
joheli committed Feb 16, 2024
1 parent 0aa8daf commit a6a832e
Show file tree
Hide file tree
Showing 7 changed files with 145 additions and 24 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ tipps_*
.gitignore
helper
dumpster/
*.bat
18 changes: 15 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# rosinenpicker

![Python Packaging](https://github.com/joheli/rosinenpicker/workflows/Packaging/badge.svg)
![Python Packaging](https://github.com/joheli/rosinenpicker/workflows/Packaging/badge.svg) ![PyPI](https://img.shields.io/pypi/v/rosinenpicker?label=PyPI) ![PyPI - Downloads](https://img.shields.io/pypi/dm/rosinenpicker)

'Rosinenpicker' is German for 'cherry picker'. Which explains what this program does: it goes through a list of documents to extract *just those juicy bits* **you** are interested in. It uses regular expressions to accomplish this. But please do read on to learn how to use the program.
'Rosinenpicker' is German for 'cherry picker' (never mind that 'Rosine' actually means *raisin*). Be that as it may - cherry picking is what `rosinenpicker` has been designed to do. It goes through a list of documents to extract *just those juicy bits* **you** are interested in. It uses regular expressions to accomplish this. But please do read on to learn how to use the program.

# Installation

Expand All @@ -16,4 +16,16 @@ This should add the executable `rosinenpicker` to `PATH`, making it accessible f

# Usage

coming soon
Please type

```
rosinenpicker -c config_file -d database_file
```

where `config_file` (default: `config.yml`) and `database_file` (default: `matches.db`) represent a yml-formatted configuration file (please see sample [config.yml](configs/config.yml), which is more or less self-explanatory) and a sqlite database file (automatically created if not present), respectively.

For help type

```
rosinenpicker -h
```
36 changes: 36 additions & 0 deletions configs/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Configuration file

# Title: contains information that helps you identify the processing run
title: 'A beautiful day'

# Strategies: this section contains parameters that guide the processing of the files
strategies:
# List your strategies here; name strategies as you please
my_strategy_01:
# processed_directory: the folder that is searched for the files to be processed
processed_directory: '/path/to/your_directory'
# file_name_pattern: a regex pattern selecting the files to be processed
file_name_pattern: 'regex_pattern_identifying_your_file'
# optional: file_content_pattern - a regex pattern that must match the file contents;
# it can be used to filter the files to be processed in addition to the file name
#file_content_pattern: '.*'
# file_format
# currently only 'pdf' and 'txt' are supported
file_format: 'pdf'
# terms
terms:
# Choose names for the terms and associate each with a regex pattern or, alternatively,
# two regex patterns surrounding '~@~', which serves as a divider.
# In the former case (i.e. only one regex pattern, no divider) matches to the regex are returned.
# In the latter case (i.e. two regex patterns, divider present) the two regex patterns are converted
# to groups surrounding a central "match-all" (.*) pattern. Only matches to the central group are returned.
my_first_term: 'regex1'
my_second_term: 'regex2@@@regex3'
# export format
# currently, the following formats are supported: csv, xlsx, html, json
export_format: 'xlsx'
# export path
export_path: '/path/to/your/file.xlsx'
# optional, for csv, set export_csv_divider (defaults to ;)
#export_csv_divider: ';'

7 changes: 5 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@ build-backend = "hatchling.build"
exclude = [
"pdfs/",
"configs/",
".github",
"helper",
".github/",
"helper/",
"dumpster/",
"*.bat",
]

[project]
Expand All @@ -27,6 +29,7 @@ classifiers = [
"Environment :: Console",
]
dependencies = [
"pandas>=2.2.0",
"pydantic>=2.6.1",
"pyyaml>=6.0.1",
"pymupdf>=1.23.22",
Expand Down
41 changes: 41 additions & 0 deletions src/rosinenpicker/exporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pandas as pd

class BaseExporter:
    """Common base for exporters that write a pandas DataFrame to a file.

    The frame handed to the constructor is kept on the instance; the
    actual writing is left to subclasses, which override :meth:`export`.
    """

    # Frame to be written out; assigned in __init__.
    dataframe: pd.DataFrame
    # Class-level default; remains None until a path is assigned.
    export_path: str = None

    def __init__(self, dataframe: pd.DataFrame) -> None:
        """Keep a reference to ``dataframe`` for a later export."""
        self.dataframe = dataframe

    def export(self, export_path: str) -> None:
        """Write the frame to ``export_path``; the base class only raises.

        Raises:
            NotImplementedError: always, as the base class has no format.
        """
        not_overridden = "This method should be implemented by subclasses."
        raise NotImplementedError(not_overridden)

class CSVExporter(BaseExporter):
    """Exporter that writes the held DataFrame as delimiter-separated text."""

    # Field delimiter passed to DataFrame.to_csv; ';' unless reassigned.
    _separator: str = ";"

    @property
    def separator(self) -> str:
        """Field delimiter used when writing the CSV file."""
        return self._separator

    @separator.setter
    def separator(self, newsep: str) -> None:
        # `newsep` carries a type annotation, not a default value: a property
        # setter is always called with the value being assigned.
        self._separator = newsep

    def export(self, export_path: str) -> None:
        """Record ``export_path`` on the instance and write the frame there as CSV."""
        self.export_path = export_path
        self.dataframe.to_csv(export_path, sep=self.separator)

class XLSXExporter(BaseExporter):
    """Exporter that writes the held DataFrame via ``DataFrame.to_excel``."""

    def export(self, export_path: str) -> None:
        """Record ``export_path`` on the instance, then write the frame to it."""
        self.export_path = export_path
        frame = self.dataframe
        frame.to_excel(excel_writer=export_path)

class HTMLExporter(BaseExporter):
    """Exporter that writes the held DataFrame via ``DataFrame.to_html``."""

    def export(self, export_path: str) -> None:
        """Record ``export_path`` on the instance, then write the frame to it."""
        self.export_path = export_path
        frame = self.dataframe
        frame.to_html(buf=export_path)

class JSONExporter(BaseExporter):
    """Exporter that writes the held DataFrame via ``DataFrame.to_json``."""

    def export(self, export_path: str) -> None:
        """Record ``export_path`` on the instance, then write the frame to it.

        The frame is serialised with ``orient='table'``.
        """
        self.export_path = export_path
        frame = self.dataframe
        frame.to_json(path_or_buf=export_path, orient="table")
4 changes: 2 additions & 2 deletions src/rosinenpicker/pydantic_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class ConfigStrategy(BaseModel):
terms: dict[str, str]
export_format: str
export_path: NewPath
export_csv_divider: str = ';'
export_csv_divider: Optional[str] = ';'
# terms_patterns_group is created from 'terms', see @model_validator
terms_patterns_group: dict[str, tuple[re.Pattern, int, int]] = None
matchall_maxlength: int = 100
Expand Down Expand Up @@ -52,7 +52,7 @@ def check_terms_and_patterns(self):
@field_validator('export_format')
@classmethod
def validate_export_format(cls, ef: str):
valid_formats = {"csv"}
valid_formats = {"csv", "html", "json", "xlsx"}
if ef not in valid_formats:
raise ConfigError(msg=f"Concerning '{ef}': Export format must conform to one of these options: {valid_formats}!")
return ef
Expand Down
62 changes: 45 additions & 17 deletions src/rosinenpicker/start.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
__version__ = '0.0.4'
__version__ = '0.0.5'
import yaml
import re
import os
import pandas as pd
from .pydantic_models import Config, ConfigStrategy, ConfigError
from .database import Base, DbRun, DbStrategy, DbProcessedFile, DbMatch
from .utils import file_sha256
from .exporter import BaseExporter, CSVExporter, XLSXExporter, HTMLExporter, JSONExporter
from .processors import DocumentProcessor, PDFProcessor, TXTProcessor
from sqlalchemy import create_engine
from sqlalchemy import create_engine, select
from sqlalchemy.orm import sessionmaker, Session
import argparse

Expand All @@ -23,8 +25,8 @@ def find_documents(directory, file_name_pattern) -> list[str]:
pattern = re.compile(file_name_pattern)
return [os.path.join(directory, f) for f in os.listdir(directory) if pattern.match(f)]

def process_strategy(strategy_name: str, cs: ConfigStrategy, session: Session, run_id: int,
processor: DocumentProcessor):
def process_strategy(strategy_name: str, cs: ConfigStrategy, db: Session, run_id: int,
processor: DocumentProcessor, exporter: BaseExporter):
# save strategy info
d_strategy = DbStrategy(run_id = run_id,
name = strategy_name,
Expand All @@ -34,14 +36,15 @@ def process_strategy(strategy_name: str, cs: ConfigStrategy, session: Session, r
export_format = cs.export_format,
export_path = str(cs.export_path),
export_csv_divider = cs.export_csv_divider)
session.add(d_strategy)
session.commit()
db.add(d_strategy)
db.commit()

# loop thru files
# locate files (documents)
documents = find_documents(cs.processed_directory, cs.file_name_pattern)
if len(documents) < 1:
raise ConfigError(f"No matching documents found with pattern {cs.file_name_pattern} in directory {cs.processed_directory}!")

# loop thru documents
for doc in documents:
pr = processor(doc, cs.matchall_maxlength)
# if file_content_pattern is given and if that pattern is not found in the document, skip the document
Expand All @@ -53,43 +56,68 @@ def process_strategy(strategy_name: str, cs: ConfigStrategy, session: Session, r
d_file = DbProcessedFile(strategy_id = d_strategy.id,
filename = os.path.basename(doc),
sha256 = file_sha256(doc))
session.add(d_file)
session.commit()
db.add(d_file)
db.commit()

# get content from file
terms_content = pr.terms_content(cs.terms_patterns_group)

# save content into database
for term, content in terms_content.items():
dm = DbMatch(file_id = d_file.id, term = term, content = content)
session.add(dm)
session.commit()
db.add(dm)
db.commit()

# export
# formulate sql query filtering for the current strategy
sql = select(DbStrategy.id, DbProcessedFile.id, DbProcessedFile.filename, DbMatch.term, DbMatch.content)\
.join(DbStrategy.processed_files).join(DbProcessedFile.matches)\
.filter(DbStrategy.id == d_strategy.id)

# translate into pandas dataframe
df = pd.read_sql_query(sql=sql, con=db.connection())

# pivot to wide
df_wide = df.pivot(index=['filename'], columns='term', values='content')

# pass wide dataframe to exporter
exp = exporter(df_wide)

# set separator for csv only
if cs.export_format == "csv":
exporter.separator = cs.export_csv_divider

# write to file
exp.export(cs.export_path)

def main(config_file: str, db_file: str):
config = read_config_file(config_file)
engine = create_engine(f'sqlite:///{db_file}')
Base.metadata.create_all(engine)

Session = sessionmaker(bind=engine)
session = Session()
db = Session()

# processor options according to file_format
# processor and exporter options according to file_format and export_format
file_format_options = {"pdf": PDFProcessor, "txt": TXTProcessor}
export_format_options = {"csv": CSVExporter, "xlsx": XLSXExporter, "html": HTMLExporter, "json": JSONExporter}

# save run info
run = DbRun(title = config.title,
yml_filename = config_file,
yml_sha256 = file_sha256(config_file))
session.add(run)
session.commit()
db.add(run)
db.commit()

for strategy_name, strategy in config.strategies.items():
# processor is chosen according to strategy.file_format
processor = file_format_options[strategy.file_format]
# exporter is chosen according to strategy.export_format
exporter = export_format_options[strategy.export_format]
# process using correct processor
process_strategy(strategy_name, strategy, session, run.id, processor)
process_strategy(strategy_name, strategy, db, run.id, processor, exporter)

session.close()
db.close()

def cli():
parser = argparse.ArgumentParser(description="A package for picking the juciest text morsels out of a pile of documents.")
Expand Down

0 comments on commit a6a832e

Please sign in to comment.