Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

QoL changes to launching server and other fixes #66

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* @eimrek @ml-evs
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ jobs:
pre-commit run --all-files

- name: Run tests
run: pytest -vv --cov-report=xml --cov-report=term ./tests
run: pytest -vv --cov=./src/optimade_maker --cov-report=xml --cov-report=term ./tests

- name: Upload coverage
uses: codecov/codecov-action@v3
Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,13 @@

# <div align="center">optimade-maker</div>

<div align="center">

[![PyPI - Version](https://img.shields.io/pypi/v/optimade-maker?color=4CC61E)](https://pypi.org/project/optimade-maker/)
![PyPI - License](https://img.shields.io/pypi/l/optimade-maker?color=blue)
![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/materialscloud-org/optimade-maker/ci.yml)

</div>

Tools for making [OPTIMADE APIs](https://optimade.org) from various formats of structural data (e.g. an archive of CIF files).

Expand Down
5 changes: 2 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name = "optimade-maker"
description = "Tools for making OPTIMADE APIs from raw structural data."
readme = "README.md"
version = "0.3.0"
requires-python = ">=3.10"
requires-python = ">= 3.10, < 3.13"
license = { text = "MIT" }
keywords = ["optimade", "jsonapi", "materials"]

Expand All @@ -24,15 +24,14 @@ dependencies = [
"pyyaml~=6.0",
"pymatgen>=2023.9",
"pandas >= 1.5, < 3",
"pybtex~=0.24",
"tqdm~=4.65",
"requests~=2.31",
"numpy >= 1.22, < 3",
"click~=8.1"
]

[project.optional-dependencies]
tests = ["pytest~=7.4", "pytest-cov~=4.0"]
tests = ["pytest~=8.3", "pytest-cov~=6.0"]
dev = ["black", "ruff", "pre-commit", "mypy", "isort"]

[tool.ruff]
Expand Down
10 changes: 7 additions & 3 deletions src/optimade_maker/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,16 @@ def cli():
type=click.Path(),
help="The path to write the JSONL file to.",
)
@click.option(
"--limit",
type=int,
help="Limit the ingestion to a fixed number of structures (useful for testing)."
)
@click.argument(
"path",
type=click.Path(),
)
def convert(jsonl_path, path):
def convert(jsonl_path, path, limit=None):
"""
Convert a raw data archive into OPTIMADE JSONL.

Expand All @@ -38,8 +43,7 @@ def convert(jsonl_path, path):
jsonl_path = Path(jsonl_path)
if jsonl_path.exists():
raise FileExistsError(f"File already exists at {jsonl_path}.")

convert_archive(Path(path), jsonl_path=jsonl_path)
convert_archive(Path(path), jsonl_path=jsonl_path, limit=limit)


@cli.command()
Expand Down
24 changes: 18 additions & 6 deletions src/optimade_maker/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from .config import Config, EntryConfig, JSONLConfig, ParsedFiles, PropertyDefinition
from .parsers import ENTRY_PARSERS, OPTIMADE_CONVERTERS, PROPERTY_PARSERS, TYPE_MAP

PROVIDER_PREFIX = os.environ.get("optimake_PROVIDER_PREFIX", "optimake")
PROVIDER_PREFIX = os.environ.get("OPTIMAKE_PROVIDER_PREFIX", "optimake")


def _construct_entry_type_info(
Expand Down Expand Up @@ -55,13 +55,14 @@ def _construct_entry_type_info(
return EntryInfoResource(**info)


def convert_archive(archive_path: Path, jsonl_path: Path | None = None) -> Path:
def convert_archive(archive_path: Path, jsonl_path: Path | None = None, limit: int | None = None) -> Path:
"""Convert an MCloud entry to an OPTIMADE JSONL file.

Parameters:
archive_path: The location of the `optimade.yaml` file to convert.
jsonl_path: The location to write the JSONL file to. If not provided,
write to `<archive_path>/optimade.jsonl`.
limit: The maximum number of entries to parse (useful for testing).

Raises:
FileNotFoundError: If any of the data paths in the config file,
Expand Down Expand Up @@ -106,7 +107,7 @@ def convert_archive(archive_path: Path, jsonl_path: Path | None = None) -> Path:

for entry in mc_config.entries:
optimade_entries[entry.entry_type].extend(
construct_entries(archive_path, entry, PROVIDER_PREFIX).values()
construct_entries(archive_path, entry, PROVIDER_PREFIX, limit=limit).values()
)

property_definitions = defaultdict(list)
Expand Down Expand Up @@ -227,21 +228,31 @@ def _parse_entries(
archive_path: Path,
matches_by_file: dict[str | None, list[Path]],
entry_type: str,
limit: int | None = None,
) -> tuple[list[Any], list[str]]:
"""Loop through the matches by file and parse them into
the intermediate format, also generating IDs for each.

Parameters:
archive_path: The path to the archive.
matches_by_file: A dictionary of matches by file.
entry_type: The type of entry to parse.
limit: The maximum number of entries to parse

Returns:
A list of parsed entries and a list of IDs.

"""
parsed_entries = []
entry_ids: list[str] = []
for archive_file in matches_by_file:
for _path in tqdm.tqdm(
for ind, _path in enumerate(tqdm.tqdm(
matches_by_file[archive_file],
desc=f"Parsing {entry_type} files",
):
)):
if limit and ind >= limit:
break

path_in_archive: Path = Path(_path).relative_to(Path(archive_path))
exceptions = {}

Expand Down Expand Up @@ -421,7 +432,7 @@ def _parse_and_assign_properties(


def construct_entries(
archive_path: Path, entry_config: EntryConfig, provider_prefix: str
archive_path: Path, entry_config: EntryConfig, provider_prefix: str, limit: int | None = None,
) -> dict[str, dict]:
"""Given an archive path and an entry specification,
loop through the provided paths and try to ingest them
Expand Down Expand Up @@ -453,6 +464,7 @@ def construct_entries(
archive_path,
entry_matches_by_file,
entry_config.entry_type,
limit=limit,
)

# Generate a better set of entry IDs
Expand Down
7 changes: 0 additions & 7 deletions src/optimade_maker/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import ase.io
import pandas
import pybtex.database
import pymatgen.core
import pymatgen.entries.computed_entries
from optimade.adapters import Structure
Expand All @@ -13,10 +12,6 @@
from optimade_maker.config import PropertyDefinition


def pybtex_to_optimade(bib_entry: Any, properties=None) -> EntryResource:
raise NotImplementedError


def load_csv_file(
p: Path,
properties: list[PropertyDefinition] | None = None,
Expand Down Expand Up @@ -118,7 +113,6 @@ def _wrapped_json_parser(path: Path) -> Any:
),
wrapped_json_parser(pymatgen.core.Structure.from_dict),
],
"references": [pybtex.database.parse_file],
}


Expand Down Expand Up @@ -159,5 +153,4 @@ def structure_ingest_wrapper(entry, properties=None): # type: ignore
str, list[Callable[[Any, list[PropertyDefinition] | None], EntryResource | dict]]
] = {
"structures": [structure_ingest_wrapper, parse_computed_structure_entry],
"references": [pybtex_to_optimade],
}
40 changes: 37 additions & 3 deletions src/optimade_maker/serve.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import os
import traceback
import warnings
from pathlib import Path

import bson.json_util
Expand Down Expand Up @@ -74,8 +75,12 @@ def _read_custom_fields(properties, info_type):

with open(jsonl_path, "r") as fhandle:
try:
for json_str in fhandle:
entry = bson.json_util.loads(json_str)
for line_no, json_str in enumerate(fhandle):
try:
entry = bson.json_util.loads(json_str)
except json.JSONDecodeError:
warnings.warn(f"Found bad JSONL line at L{line_no}")
continue

if "properties" in entry:
if "type" not in entry:
Expand All @@ -89,6 +94,13 @@ def _read_custom_fields(properties, info_type):
if entry["type"] == "info":
_read_custom_fields(entry["properties"], entry["id"])

elif "x-optimade" in entry:
continue
# If this isn't an info endpoint, or the first line header, then we break
# as presumably we have reached the data itself
else:
break

except Exception as exc:
traceback.print_exc()
print(f"Error {exc}")
Expand All @@ -101,9 +113,18 @@ class OptimakeServer:
Uses the MongoMock backend.
"""

def __init__(self, path: Path, port: int = 5000):
def __init__(self, path: Path, port: int = 5000, **config_kws):
"""Initialise the OptimakeServer instance.

Parameters:
path: Path to the directory containing the optimade.jsonl file.
port: Port to run the API on.
config_kws: Additional optimade-python-tools configuration options to pass to the API.

"""
self.path = path
self.port = port
self.config_kws = config_kws

self.base_url = f"http://localhost:{self.port}"
# self.index_base_url = "http://localhost:5001"
Expand All @@ -119,13 +140,26 @@ def get_optimade_config(self):
"debug": False,
"insert_test_data": False,
"insert_from_jsonl": str(jsonl_path.resolve()),
"create_default_index": True,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might be the one bit that is problematic, @eimrek: I added the ability to create a unique ID index to optimade-python-tools, which is currently unreleased. When it gets released, serve will start making a unique index by default, which may conflict with any existing indexes you have... We could always set this to false by default to keep the old behaviour, and users can now pass it as an env var anyway.

"base_url": self.base_url,
"provider": get_optimake_provider_info(),
# "index_base_url": self.index_base_url,
"provider_fields": provider_fields,
"log_dir": str(self.path.resolve()),
}

config_dict.update(self.config_kws)

# Loop through any environment variables that start with "OPTIMAKE_" and set them
for env in os.environ:
if env.startswith("OPTIMAKE_"):
LOGGER.debug(
"Reading environment variable %s into config with value %s",
env,
os.environ[env],
)
config_dict[env.replace("OPTIMAKE_", "").lower()] = os.environ[env]

LOGGER.debug(f"CONFIG: {config_dict}")

return config_dict
Expand Down
Loading