diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml new file mode 100644 index 000000000..3ebbf5504 --- /dev/null +++ b/.github/workflows/codespell.yml @@ -0,0 +1,22 @@ +--- +name: Codespell + +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + codespell: + name: Check for spelling errors + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Codespell + uses: codespell-project/actions-codespell@v2 diff --git a/.gitignore b/.gitignore index 0ed907210..98a45a25d 100644 --- a/.gitignore +++ b/.gitignore @@ -141,6 +141,7 @@ tests/output/eval-* tasks/ training/ +!tests/input/training/ preserved/ random LOG diff --git a/Makefile b/Makefile index 8a1a1afcb..347a6cccb 100644 --- a/Makefile +++ b/Makefile @@ -57,7 +57,7 @@ gh-deploy: all_recipes: tests/output/owl/merged/recipe-all-merged.owl # prefix with 'web' for a URL in recipe-urls.csv -# prefix wiyth 'case' for a previously downloaded recipe in cases/ directory +# prefix with 'case' for a previously downloaded recipe in cases/ directory RECIPES = case-spaghetti case-egg-noodles case-tortilla-soup \ web-spinach-and-feta-turkey-burgers \ web-shrimp-and-cheesy-grits-with-bacon \ diff --git a/docs/custom.md b/docs/custom.md index 42b2f6790..bc2ef3340 100644 --- a/docs/custom.md +++ b/docs/custom.md @@ -469,4 +469,4 @@ For example, if your schema is named `albatross.yaml`, then an extract command i ontogpt extract -t albatross.yaml -i input.txt ``` -Running this (or any other command including your custom schema) will install it for future use with OntoGPT, so in subsquent commands it can be referred to by its name (e.g., `albatross`, without the file extension or a full filepath). 
+Running this (or any other command including your custom schema) will install it for future use with OntoGPT, so in subsequent commands it can be referred to by its name (e.g., `albatross`, without the file extension or a full filepath). diff --git a/docs/functions.md b/docs/functions.md index cadec94e6..536d5c9a6 100644 --- a/docs/functions.md +++ b/docs/functions.md @@ -194,7 +194,7 @@ Including an instruction like the following anecdotally helps to avoid parsing f ### selectcols -Use the option `selectcols` to specify exact colums to use when parsing tabular files as input. +Use the option `selectcols` to specify exact columns to use when parsing tabular files as input. Example: diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 473a335e5..f5eb4094b 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -20,7 +20,7 @@ OntoGPT uses `oaklib` to handle the ontologies it uses as annotators, and `oakli To change the download location, set the `PYSTOW_HOME` variable in your environment to your preferred path. 
-For example, to save downloads to `/tmp/oaklib`, set the varible like this: +For example, to save downloads to `/tmp/oaklib`, set the variable like this: ```bash export PYSTOW_HOME='/tmp/' diff --git a/notebooks/BioEPIC_demo.ipynb b/notebooks/BioEPIC_demo.ipynb index 7780d5358..d6288f4a6 100644 --- a/notebooks/BioEPIC_demo.ipynb +++ b/notebooks/BioEPIC_demo.ipynb @@ -18,7 +18,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The following examples demonstrate basic functionality of OntoGPT and the SPIRES method for extracting and integrating data (i.e., concepts and relationships) from texts in the envrionmental and earth science domains.\n", + "The following examples demonstrate basic functionality of OntoGPT and the SPIRES method for extracting and integrating data (i.e., concepts and relationships) from texts in the environmental and earth science domains.\n", "These examples assume use of the LBNL CBORG computing resource." ] }, @@ -225,7 +225,7 @@ " A semicolon-separated list of variables measured in\n", " environmental and earth science research. 
Examples\n", " include: root shape, biomass, water turbidity\n", - " equipments:\n", + " equipment:\n", " range: Equipment\n", " description: >-\n", " A semicolon-separated list of equipment used in\n", diff --git a/pyproject.toml b/pyproject.toml index ae331680a..db7a05348 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,3 +104,9 @@ reverse_relative = true [build-system] requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"] build-backend = "poetry_dynamic_versioning.backend" + +[tool.codespell] +skip = '.git,*.pdf,*.svg,poetry.lock,output,*.tsv,./tests/input,old' +# some specific phrases, variables and mixed case (CamelCase etc) +ignore-regex = '\b(Torsades de pointes|[A-Z][a-zA-Z]*|[a-z]+[A-Z][a-zA-Z]*|de pointes)\b|\bcommments:' +ignore-words-list = 'langual,sting,infarction,holliday,cyclin,convertor,ser,collapsin,infarctions,euclidian,dependant,vrsatile,comemnt' diff --git a/src/ontogpt/cli.py b/src/ontogpt/cli.py index 13dc83a52..c6705656e 100644 --- a/src/ontogpt/cli.py +++ b/src/ontogpt/cli.py @@ -1208,7 +1208,7 @@ def synonyms( ontogpt synonyms -m ollama/llama3 --context "political" "abdicate" - ontogpt synonyms -m ollama/llama3 --context "biological" "dessicate" + ontogpt synonyms -m ollama/llama3 --context "biological" "desiccate" """ logging.info(f"Creating for {term}") @@ -2207,7 +2207,7 @@ def list_models(): Max Tokens: Token limit for the model. Note that models may tokenize text differently and calculate input and/or output tokens - in particular ways, so consult a model's original documentaion for + in particular ways, so consult a model's original documentation for further details. 
""" models = get_model_cost_map("") diff --git a/src/ontogpt/clients/pubmed_client.py b/src/ontogpt/clients/pubmed_client.py index 21e48e89f..251700726 100644 --- a/src/ontogpt/clients/pubmed_client.py +++ b/src/ontogpt/clients/pubmed_client.py @@ -183,7 +183,7 @@ def text( :param ids: List of PubMed IDs, or string with single PMID :param raw: if True, do not parse the xml, just return the raw output with tags :param autoformat: if True include title and abstract concatenated - :param pubmedcentral: if True, retreive text from PubMed Central where possible + :param pubmedcentral: if True, retrieve text from PubMed Central where possible :return: the text of a single entry, or a list of strings for text of multiple entries """ batch_size = 200 diff --git a/src/ontogpt/evaluation/drugmechdb/eval_drugmechdb.py b/src/ontogpt/evaluation/drugmechdb/eval_drugmechdb.py index 65cc23753..b5fd3a6ab 100644 --- a/src/ontogpt/evaluation/drugmechdb/eval_drugmechdb.py +++ b/src/ontogpt/evaluation/drugmechdb/eval_drugmechdb.py @@ -41,7 +41,7 @@ def _fix_source_mechanism(mechanism_dict: dict) -> dict: g["id"] = g["_id"] del g["_id"] # normalize alt_ids - bad_fields = ["all_id", "alt_name", "alt-name", "comemt", "comemnt"] + bad_fields = ["all_id", "alt_name", "alt-name", "comemt", "comemnt"] for n in mechanism_dict["nodes"]: if "alt_ids" in n and isinstance(n["alt_ids"], str): n["alt_ids"] = [n["alt_ids"]] diff --git a/src/ontogpt/templates/dietitian_notes.yaml b/src/ontogpt/templates/dietitian_notes.yaml index a233ffbac..22db3944d 100644 --- a/src/ontogpt/templates/dietitian_notes.yaml +++ b/src/ontogpt/templates/dietitian_notes.yaml @@ -294,7 +294,7 @@ classes: range: string # TODO: distinguish whether this is currently active therapy - # or a reccomendation for future therapy (but not yet started) + # or a recommendation for future therapy (but not yet started) TherapeuticMaterial: description: >- A specific material added to a patient's diet or diff --git 
a/src/ontogpt/templates/ecosim_methods.py b/src/ontogpt/templates/ecosim_methods.py index b3280f677..3ee82faab 100644 --- a/src/ontogpt/templates/ecosim_methods.py +++ b/src/ontogpt/templates/ecosim_methods.py @@ -198,7 +198,7 @@ class TermSet(NamedEntity): locations: Optional[List[str]] = Field(None, description="""A semicolon-separated list of research locations. Examples include: Vermont, New York City, Ethiopia""", json_schema_extra = { "linkml_meta": {'alias': 'locations', 'domain_of': ['TermSet']} }) methods: Optional[List[str]] = Field(None, description="""A semicolon-separated list of methods used in environmental and earth science research. Examples include: sampling, spectroscopy""", json_schema_extra = { "linkml_meta": {'alias': 'methods', 'domain_of': ['TermSet']} }) variables: Optional[str] = Field(None, description="""A semicolon-separated list of variables measured in environmental and earth science research. Examples include: root shape, biomass, water turbidity""", json_schema_extra = { "linkml_meta": {'alias': 'variables', 'domain_of': ['TermSet']} }) - equipments: Optional[str] = Field(None, description="""A semicolon-separated list of equipment used in environmental and earth science research.""", json_schema_extra = { "linkml_meta": {'alias': 'equipments', 'domain_of': ['TermSet']} }) + equipment: Optional[str] = Field(None, description="""A semicolon-separated list of equipment used in environmental and earth science research.""", json_schema_extra = { "linkml_meta": {'alias': 'equipment', 'domain_of': ['TermSet']} }) equipment_to_variable_relationships: Optional[List[EquipmentMeasuresVariable]] = Field(None, description="""A semicolon separated list of relationships between specific equipment and variables they are used to measure as described in the input. 
Example: NMR spectrometer was used to measure chemical content""", json_schema_extra = { "linkml_meta": {'alias': 'equipment_to_variable_relationships', 'domain_of': ['TermSet']} }) id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, diff --git a/src/ontogpt/templates/ecosim_methods.yaml b/src/ontogpt/templates/ecosim_methods.yaml index 5900af1e7..031f201ab 100644 --- a/src/ontogpt/templates/ecosim_methods.yaml +++ b/src/ontogpt/templates/ecosim_methods.yaml @@ -42,7 +42,7 @@ classes: A semicolon-separated list of variables measured in environmental and earth science research. Examples include: root shape, biomass, water turbidity - equipments: + equipment: range: Equipment description: >- A semicolon-separated list of equipment used in diff --git a/src/ontogpt/templates/pathology.py b/src/ontogpt/templates/pathology.py index 71b8a645e..cdb85036d 100644 --- a/src/ontogpt/templates/pathology.py +++ b/src/ontogpt/templates/pathology.py @@ -318,7 +318,7 @@ class PathologyReport(ConfiguredBaseModel): """ linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/pathology', 'tree_root': True}) - pathology_statements: Optional[List[PathologyStatement]] = Field(None, description="""A semicolon-delimited list of pathology statements, each describing a pathology, including any diagnoses, one or more specific qualities being measured and the anatomical location or tissue the pathology is measured in. 
If any of the pathology statements are negative, the negation should be included in each statment, e.g., \"no granulomas or viropathic changes\" should become \"no granulomas\" and \"no viropathic changes\".""", json_schema_extra = { "linkml_meta": {'alias': 'pathology_statements', 'domain_of': ['PathologyReport']} }) + pathology_statements: Optional[List[PathologyStatement]] = Field(None, description="""A semicolon-delimited list of pathology statements, each describing a pathology, including any diagnoses, one or more specific qualities being measured and the anatomical location or tissue the pathology is measured in. If any of the pathology statements are negative, the negation should be included in each statement, e.g., \"no granulomas or viropathic changes\" should become \"no granulomas\" and \"no viropathic changes\".""", json_schema_extra = { "linkml_meta": {'alias': 'pathology_statements', 'domain_of': ['PathologyReport']} }) is_benign: Optional[str] = Field(None, description="""Whether the overall pathology appears to be benign and not malignant. Other pathologies may be present, but if tissue is described as benign and/or if a carcinoma is explicitly excluded, this value should be true. A statement of \"no significant pathologic abnormality\" or the short form \"nspa\" would also have a value of true. It it otherwise 'unclear'.""", json_schema_extra = { "linkml_meta": {'alias': 'is_benign', 'annotations': {'prompt.example': {'tag': 'prompt.example', 'value': 'true, false, unclear'}}, diff --git a/src/ontogpt/templates/pathology.yaml b/src/ontogpt/templates/pathology.yaml index 150a3cbe6..b1a343f8a 100644 --- a/src/ontogpt/templates/pathology.yaml +++ b/src/ontogpt/templates/pathology.yaml @@ -43,7 +43,7 @@ classes: pathology, including any diagnoses, one or more specific qualities being measured and the anatomical location or tissue the pathology is measured in. 
If any of the pathology statements are negative, the - negation should be included in each statment, e.g., "no granulomas or + negation should be included in each statement, e.g., "no granulomas or viropathic changes" should become "no granulomas" and "no viropathic changes". range: PathologyStatement diff --git a/src/ontogpt/templates/recipe.yaml b/src/ontogpt/templates/recipe.yaml index 04f2646c3..49b97c376 100644 --- a/src/ontogpt/templates/recipe.yaml +++ b/src/ontogpt/templates/recipe.yaml @@ -18,7 +18,7 @@ prefixes: qudt: http://qudt.org/schema/qudt/ dbpediaont: http://dbpedia.org/ontology/ -# This template incorportates syntax from +# This template incorporates syntax from # linkml-owl to define OWL interpretations # and enable advanced functionality. # https://linkml.io/linkml-owl/templates/ diff --git a/src/ontogpt/utils/pymupdf_helpers.py b/src/ontogpt/utils/pymupdf_helpers.py index 9b909c2c7..bec15c643 100644 --- a/src/ontogpt/utils/pymupdf_helpers.py +++ b/src/ontogpt/utils/pymupdf_helpers.py @@ -106,7 +106,7 @@ def fonts(doc, granularity=False): def font_tags(font_counts, styles): """Return dictionary with font sizes as keys and tags as value. - :param font_counts: (font_size, count) for all fonts occuring in document + :param font_counts: (font_size, count) for all fonts occurring in document :type font_counts: list :param styles: all styles found in the document :type styles: dict