Skip to content

Commit

Permalink
Merge branch 'main' into urdu-verb-query
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtavis authored Oct 10, 2024
2 parents 7db43fd + 5ddb577 commit c51da81
Show file tree
Hide file tree
Showing 138 changed files with 3,866 additions and 396 deletions.
20 changes: 12 additions & 8 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# OS Files
##########
# MARK: OS Files

.DS_Store
.vscode/*
!.vscode/extensions.json
Expand All @@ -8,8 +8,8 @@
*wiki_partitions
*wiki.ndjson

# Python Files
##############
# MARK: Python Files

# setup.py working directory
build
# setup.py dist directory
Expand All @@ -24,11 +24,15 @@ __pycache__
venv
.venv

# NPM Files
###########
# MARK: NPM Files

node_modules
package-lock.json

# Intermerdiary Data Files
##########################
# MARK: Intermediary Files

**/*_queried.json

# MARK: Test Files

tests_output
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/).
- Total Wikidata lexemes for languages and data types can be derived with the `--total` command ([#147](https://github.com/scribe-org/Scribe-Data/issues/147)).
- Commands can be used via an interactive mode with the `--interactive` command ([#158](https://github.com/scribe-org/Scribe-Data/issues/158)).
- Articles are removed from machine translations so they're more directly useful in Scribe applications ([#96](https://github.com/scribe-org/Scribe-Data/issues/96)).
- Queries for Basque verbs and adjectives were expanded and added respectively ([#222](https://github.com/scribe-org/Scribe-Data/issues/222)).
- The query for Danish verbs was expanded ([#225](https://github.com/scribe-org/Scribe-Data/issues/225)).

### 🐞 Bug Fixes

Expand All @@ -44,6 +46,7 @@ Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/).
- Many files were renamed including `update_data.py` being renamed `query_data.py`
- Paths within the package have been updated to work for all operating systems via `pathlib` ([#125](https://github.com/scribe-org/Scribe-Data/issues/125)).
- The language formatting scripts have been dramatically simplified given changes to export paths all being the same.
- The `update_files` directory was removed in preparation for other means of showing data totals.

## Scribe-Data 3.3.0

Expand Down
2 changes: 1 addition & 1 deletion docs/source/scribe_data/cli.rst
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ Behavior and Output:

.. code-block:: text
Updating data for language: English, data type: ['verbs']
Updating data for language(s): English; data type(s): verbs
Data updated: 0%|
2. If existing files are found, you'll be prompted to choose an option:
Expand Down
142 changes: 134 additions & 8 deletions src/scribe_data/cli/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import json
import shutil
from pathlib import Path
from typing import Optional

from scribe_data.cli.cli_utils import language_map
from scribe_data.load.data_to_sqlite import data_to_sqlite
Expand All @@ -33,10 +32,33 @@
get_language_iso,
)

# MARK: JSON


def export_json(
language: str, data_type: str, output_dir: Path, overwrite: bool
) -> None:
"""
Export a JSON file from the CLI process.
Parameters
----------
language : str
The language of the file to convert.
data_type : str
The data type of the file to convert.
output_dir : Path
The output directory path for results.
overwrite : bool
Whether to overwrite existing files.
Returns
-------
A JSON file saved in the given location.
"""
normalized_language = language_map.get(language.lower())

if not normalized_language:
Expand All @@ -56,7 +78,7 @@ def export_json(
return

try:
with data_file.open("r") as file:
with data_file.open("r", encoding="utf-8") as file:
data = json.load(file)

except (IOError, json.JSONDecodeError) as e:
Expand Down Expand Up @@ -85,9 +107,40 @@ def export_json(
)


# MARK: CSV or TSV


def convert_to_csv_or_tsv(
language: str, data_type: list, output_dir: Path, overwrite: bool, output_type: str
language: str,
data_type: list,
output_dir: Path,
overwrite: bool,
output_type: str,
) -> None:
"""
Converts a Scribe-Data output file to a CSV or TSV file.
Parameters
----------
output_type : str
The file type to convert to (CSV or TSV).
language : str
The language of the file to convert.
data_type : str
The data type of the file to convert.
output_dir : Path
The output directory path for results.
overwrite : bool
Whether to overwrite existing files.
Returns
-------
A CSV or TSV file saved in the given location.
"""
normalized_language = language_map.get(language.lower())
if not normalized_language:
print(f"Language '{language}' is not recognized.")
Expand All @@ -110,7 +163,7 @@ def convert_to_csv_or_tsv(
)

try:
with file_path.open("r") as f:
with file_path.open("r", encoding="utf-8") as f:
data = json.load(f)

except (IOError, json.JSONDecodeError) as e:
Expand Down Expand Up @@ -154,12 +207,36 @@ def convert_to_csv_or_tsv(
print(f"Data for '{dtype}' written to '{output_file}'")


# MARK: SQLITE


def convert_to_sqlite(
language: Optional[str] = None,
data_type: Optional[str] = None,
output_dir: Optional[str] = None,
overwrite: bool = False,
language: str,
data_type: str,
output_dir: Path,
overwrite: bool,
) -> None:
"""
Converts a Scribe-Data output file to an SQLite file.
Parameters
----------
language : str
The language of the file to convert.
data_type : str
The data type of the file to convert.
output_dir : Path
The output directory path for results.
overwrite : bool
Whether to overwrite existing files.
Returns
-------
A SQLite file saved in the given location.
"""
if not language:
raise ValueError("Language must be specified for SQLite conversion.")

Expand Down Expand Up @@ -191,3 +268,52 @@ def convert_to_sqlite(

else:
print("No output directory specified. SQLite file remains in default location.")


# MARK: Convert


def convert(
    language: str, data_type: str, output_dir: str, overwrite: bool, output_type: str
) -> None:
    """
    Converts a Scribe-Data output file to a different file type.

    Dispatches to the JSON exporter or the CSV/TSV converter based on
    ``output_type``, after making sure the output directory exists.

    Parameters
    ----------
    language : str
        The language of the file to convert.

    data_type : str
        The data type of the file to convert. Passed through unchanged to
        the selected converter.

    output_dir : str
        The output directory path for results. When given, it is resolved
        to an absolute path and created (including parents) if missing.
        A falsy value (``None`` or ``""``) is forwarded as-is.

    overwrite : bool
        Whether to overwrite existing files.

    output_type : str
        The file type to convert to: ``"json"`` (also used when ``None``),
        ``"csv"`` or ``"tsv"``.

    Returns
    -------
    None
        The converted file is written by the selected converter.

    Raises
    ------
    ValueError
        If ``output_type`` is not ``"json"``, ``"csv"``, ``"tsv"`` or ``None``.
    """
    # Keep the resolved location in its own name rather than rebinding the
    # ``output_dir`` parameter from ``str`` to ``Path``.
    resolved_dir = output_dir
    if output_dir:
        resolved_dir = Path(output_dir).resolve()
        if not resolved_dir.exists():
            resolved_dir.mkdir(parents=True, exist_ok=True)

    # JSON is the default when no output type is given.
    if output_type == "json" or output_type is None:
        export_json(language, data_type, resolved_dir, overwrite)

    elif output_type in {"csv", "tsv"}:
        convert_to_csv_or_tsv(
            language, data_type, resolved_dir, overwrite, output_type
        )

    else:
        raise ValueError(
            "Unsupported output type. Please use 'json', 'csv', or 'tsv'."
        )
Loading

0 comments on commit c51da81

Please sign in to comment.