Skip to content

Commit

Permalink
Merge pull request #38 from leej3/data-updates
Browse files Browse the repository at this point in the history
Data updates
  • Loading branch information
leej3 authored Aug 23, 2024
2 parents cf218b7 + 643308d commit 04c765a
Show file tree
Hide file tree
Showing 11 changed files with 470 additions and 114 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@ osm_output
.terraform
.terraform.lock.hcl
.public_dns
tempdata
7 changes: 7 additions & 0 deletions compose.development.override.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ services:
- ./external_components/rtransparent:/app

web_api:
container_name: web_api
environment:
- MONGODB_URI=mongodb://db:27017/test
build:
Expand All @@ -17,12 +18,14 @@ services:
- 80:80
volumes:
- ./web/api:/app/app
- ./osm:/opt/osm/osm
working_dir: /app/app
command: ["fastapi","dev","--host","0.0.0.0","--port","80"]
depends_on:
- db

dashboard:
container_name: dashboard
build:
context: .
dockerfile: ./web/dashboard/Dockerfile
Expand All @@ -31,8 +34,12 @@ services:
working_dir: /app
ports:
- "8501:8501"
volumes:
- ./web/dashboard:/app
- ./osm:/opt/osm/osm

db:
container_name: db
# use old version of mongo to avoid Apple Instruction set error
image: mongo:4.4.6
ports:
Expand Down
2 changes: 2 additions & 0 deletions compose.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
services:
sciencebeam:
container_name: sciencebeam
image: elifesciences/sciencebeam-parser
ports:
- "8070:8070"
rtransparent:
container_name: rtransparent
image: nimhdsst/rtransparent:staging
ports:
- "8071:8071"
Expand Down
98 changes: 98 additions & 0 deletions osm/schemas/custom_fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from typing import Any, ClassVar, Generic, TypeVar, Union

import odmantic
from pydantic.annotated_handlers import GetCoreSchemaHandler
from pydantic.json_schema import JsonSchemaValue
from pydantic_core import CoreSchema, core_schema

# Constrained type variable: the payload wrapped by a LongField is either str or bytes.
T = TypeVar("T", str, bytes)


def _display(value: T) -> str:
if isinstance(value, bytes):
return b"..." if value else b""
return f"{value[:10]}..." if value else ""


class LongField(Generic[T]):
    """Wrapper around a long ``str``/``bytes`` value that hides it from reprs.

    Subclasses supply two class attributes:

    * ``_inner_schema``: the pydantic-core schema used to validate the raw
      (unwrapped) value.
    * ``_error_kind``: the custom error type reported when the python-mode
      union validation fails.

    ``repr()`` always yields ``'""'`` and ``str()`` yields the abbreviated
    form produced by ``_display``; the full value is only available through
    :meth:`get_value`.
    """

    _inner_schema: ClassVar[CoreSchema]
    _error_kind: ClassVar[str]

    @classmethod
    def __get_pydantic_core_schema__(
        cls, source: type[Any], handler: GetCoreSchemaHandler
    ) -> CoreSchema:
        """Build the pydantic-core schema for this wrapper type.

        Args:
            source: The annotated type; it is called with the validated raw
                value to construct the wrapper instance.
            handler: Pydantic's schema handler (not used here).

        Returns:
            A lax-or-strict schema. Python input may be an existing
            ``source`` instance or a raw value matching ``_inner_schema``;
            JSON input must match ``_inner_schema``. In both cases raw values
            are wrapped by calling ``source``.
        """

        def serialize(
            value: "LongField[T]", info: core_schema.SerializationInfo
        ) -> Union[str, "LongField[T]"]:
            # Only the abbreviated form is emitted when dumping to JSON.
            if info.mode == "json":
                return _display(value.get_value())
            else:
                return value

        # Validate the raw value with the inner schema, then wrap it.
        wrapping_schema = core_schema.no_info_after_validator_function(
            source,  # construct the type
            cls._inner_schema,
        )

        def get_schema(strict: bool) -> CoreSchema:
            return core_schema.json_or_python_schema(
                python_schema=core_schema.union_schema(
                    [
                        # Already-wrapped instances pass through unchanged.
                        core_schema.is_instance_schema(source),
                        wrapping_schema,
                    ],
                    custom_error_type=cls._error_kind,
                    strict=strict,
                ),
                json_schema=wrapping_schema,
                serialization=core_schema.plain_serializer_function_ser_schema(
                    serialize,
                    info_arg=True,
                    return_schema=core_schema.str_schema(),
                    when_used="json",
                ),
            )

        return core_schema.lax_or_strict_schema(
            lax_schema=get_schema(strict=False),
            strict_schema=get_schema(strict=True),
        )

    def __init__(self, value: T):
        """Store *value* as the wrapped payload."""
        self._value = value

    def get_value(self) -> T:
        """Return the full, unabbreviated wrapped value."""
        return self._value

    def __repr__(self) -> str:
        return '""'  # Always return an empty string representation

    def __str__(self) -> str:
        """Return the abbreviated form of the wrapped value."""
        return _display(self._value)


class LongStr(LongField[str]):
    """Text held in a ``LongField`` so that its repr never shows the full content.

    Raw input is validated as a string; validation failures in python mode
    are reported with the ``string_type`` error kind.
    """

    _error_kind: ClassVar[str] = "string_type"
    _inner_schema: ClassVar[CoreSchema] = core_schema.str_schema()


class LongBytes(LongField[bytes]):
    """Binary data held in a ``LongField`` so that its repr never shows the full content.

    Raw input is validated as bytes; validation failures in python mode are
    reported with the ``bytes_type`` error kind.
    """

    _error_kind: ClassVar[str] = "bytes_type"
    _inner_schema: ClassVar[CoreSchema] = core_schema.bytes_schema()


# Embedded document with a single binary ``content`` field typed as LongBytes.
class FilePlaceholder(odmantic.EmbeddedModel):
    # The default is plain empty bytes rather than a LongBytes instance.
    # NOTE(review): confirm defaults are validated/wrapped before callers rely
    # on ``content.get_value()``.
    # NOTE(review): json_schema_extra={"exclude": True} presumably aims to keep
    # the payload out of exported output — verify how this key is consumed.
    content: LongBytes = odmantic.Field(
        default=b"", json_schema_extra={"exclude": True}
    )
95 changes: 55 additions & 40 deletions osm/schemas/metrics_schemas.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import types
from typing import Optional

from odmantic import EmbeddedModel
from pydantic import field_validator

# The rtransparent tool can extract from parsed pdfs or from XML directly from pubmed central. The latter has many more fields.
from .custom_fields import LongStr


# The rtransparent tool can extract from parsed pdfs or from XML directly from pubmed central. The latter has many more fields.
# all_indicators.csv from the rtransparent publication has both but has the following extra fields:
# code_text,com_code,com_data_availibility,com_file_formats,com_general_db,com_github_data,com_specific_db,com_suppl_code,com_supplemental_data,data_text,dataset,eigenfactor_score,field,is_art,is_code_pred,is_data_pred,is_relevant_code,is_relevant_data,jif,n_cite,score,year,
class RtransparentMetrics(EmbeddedModel):
Expand All @@ -13,19 +16,19 @@ class RtransparentMetrics(EmbeddedModel):
is_open_data: Optional[bool]

# Optional fields
year: Optional[float] = None
year: Optional[int] = None
filename: Optional[str] = None
pmcid_pmc: Optional[int] = None
pmid: Optional[float] = None
pmid: Optional[int] = None
doi: Optional[str] = None
year_epub: Optional[float] = None
year_ppub: Optional[float] = None
year_epub: Optional[int] = None
year_ppub: Optional[int] = None
journal: Optional[str] = None
publisher: Optional[str] = None
affiliation_country: Optional[str] = None
affiliation_institution: Optional[str] = None
type: Optional[str] = None
data_text: Optional[str] = None
data_text: Optional[LongStr] = None
is_relevant_data: Optional[bool] = None
com_specific_db: Optional[str] = None
com_general_db: Optional[str] = None
Expand All @@ -34,18 +37,18 @@ class RtransparentMetrics(EmbeddedModel):
com_file_formats: Optional[str] = None
com_supplemental_data: Optional[str] = None
com_data_availibility: Optional[str] = None
code_text: Optional[str] = None
code_text: Optional[LongStr] = None
is_relevant_code: Optional[bool] = None
com_code: Optional[str] = None
com_suppl_code: Optional[str] = None
is_coi_pred: Optional[bool] = None
coi_text: Optional[str] = None
coi_text: Optional[LongStr] = None
is_coi_pmc_fn: Optional[bool] = None
is_coi_pmc_title: Optional[str] = None
is_coi_pmc_title: Optional[bool] = None
is_relevant_coi: Optional[bool] = None
is_relevant_coi_hi: Optional[bool] = None
is_relevant_coi_lo: Optional[bool] = None
is_explicit_coi: Optional[str] = None
is_explicit_coi: Optional[bool] = None
coi_1: Optional[bool] = None
coi_2: Optional[bool] = None
coi_disclosure_1: Optional[bool] = None
Expand All @@ -66,7 +69,7 @@ class RtransparentMetrics(EmbeddedModel):
board_1: Optional[bool] = None
no_coi_1: Optional[bool] = None
no_funder_role_1: Optional[bool] = None
fund_text: Optional[str] = None
fund_text: Optional[LongStr] = None
fund_pmc_institute: Optional[str] = None
fund_pmc_source: Optional[str] = None
fund_pmc_anysource: Optional[str] = None
Expand Down Expand Up @@ -109,37 +112,37 @@ class RtransparentMetrics(EmbeddedModel):
acknow_1: Optional[bool] = None
disclosure_1: Optional[bool] = None
disclosure_2: Optional[bool] = None
fund_ack: Optional[str] = None
project_ack: Optional[str] = None
fund_ack: Optional[bool] = None
project_ack: Optional[bool] = None
is_register_pred: Optional[bool] = None
register_text: Optional[str] = None
register_text: Optional[LongStr] = None
is_research: Optional[bool] = None
is_review: Optional[bool] = None
is_reg_pmc_title: Optional[bool] = None
is_relevant_reg: Optional[bool] = None
is_method: Optional[bool] = None
is_NCT: Optional[bool] = None
is_explicit_reg: Optional[str] = None
prospero_1: Optional[str] = None
registered_1: Optional[str] = None
registered_2: Optional[str] = None
registered_3: Optional[str] = None
registered_4: Optional[str] = None
registered_5: Optional[str] = None
not_registered_1: Optional[str] = None
registration_1: Optional[str] = None
registration_2: Optional[str] = None
registration_3: Optional[str] = None
registration_4: Optional[str] = None
registry_1: Optional[str] = None
reg_title_1: Optional[str] = None
reg_title_2: Optional[str] = None
reg_title_3: Optional[str] = None
reg_title_4: Optional[str] = None
funded_ct_1: Optional[str] = None
ct_2: Optional[str] = None
ct_3: Optional[str] = None
protocol_1: Optional[str] = None
is_explicit_reg: Optional[bool] = None
prospero_1: Optional[bool] = None
registered_1: Optional[bool] = None
registered_2: Optional[bool] = None
registered_3: Optional[bool] = None
registered_4: Optional[bool] = None
registered_5: Optional[bool] = None
not_registered_1: Optional[bool] = None
registration_1: Optional[bool] = None
registration_2: Optional[bool] = None
registration_3: Optional[bool] = None
registration_4: Optional[bool] = None
registry_1: Optional[bool] = None
reg_title_1: Optional[bool] = None
reg_title_2: Optional[bool] = None
reg_title_3: Optional[bool] = None
reg_title_4: Optional[bool] = None
funded_ct_1: Optional[bool] = None
ct_2: Optional[bool] = None
ct_3: Optional[bool] = None
protocol_1: Optional[bool] = None
is_success: Optional[bool] = None
is_art: Optional[bool] = None
field: Optional[str] = None
Expand All @@ -150,13 +153,13 @@ class RtransparentMetrics(EmbeddedModel):
# some extra fields
affiliation_aff_id: Optional[str] = None
affiliation_all: Optional[str] = None
article: Optional[int] = None
article: Optional[str] = None
author: Optional[str] = None
author_aff_id: Optional[str] = None
correspondence: Optional[str] = None
date_epub: Optional[str] = None
date_ppub: Optional[str] = None
funding_text: Optional[str] = None
funding_text: Optional[LongStr] = None
is_explicit: Optional[bool] = None
is_fund_pred: Optional[bool] = None
is_funded_pred: Optional[bool] = None
Expand All @@ -174,9 +177,9 @@ class RtransparentMetrics(EmbeddedModel):
n_ref: Optional[str] = None
n_table_body: Optional[str] = None
n_table_floats: Optional[str] = None
open_code_statements: Optional[str] = None
open_data_category: Optional[str] = None
open_data_statements: Optional[str] = None
open_code_statements: Optional[LongStr] = None
open_data_category: Optional[LongStr] = None
open_data_statements: Optional[LongStr] = None
pii: Optional[str] = None
pmcid_uid: Optional[str] = None
publisher_id: Optional[str] = None
Expand All @@ -185,6 +188,18 @@ class RtransparentMetrics(EmbeddedModel):
is_data_pred: Optional[bool] = None
is_code_pred: Optional[bool] = None

@field_validator("article", mode="before")
@classmethod
def coerce_to_string(cls, v: object) -> Optional[str]:
    """Coerce numeric ``article`` input to ``str`` before field validation.

    Runs in ``before`` mode so that it sees the raw input: in the default
    ``after`` mode the field's own string validation would run first and
    the coercion below would not get a chance to apply to numbers.

    Args:
        v: The raw value supplied for ``article``.

    Returns:
        ``None`` or a ``str`` unchanged; ``str(v)`` for ``int``, ``float``
        and ``bool`` input.

    Raises:
        ValueError: If *v* is of any other type.
    """
    if v is None or isinstance(v, str):
        return v
    # bool is a subclass of int, so it is covered by this check as well.
    if isinstance(v, (int, float)):
        return str(v)
    raise ValueError(
        "string required or a type that can be coerced to a string"
    )


# Tried to define programmatically but both ways seemed to yield a model class without type annotated fields...

Expand Down
Loading

0 comments on commit 04c765a

Please sign in to comment.