diff --git a/examples/batch_process.py b/examples/batch_process.py index 865ad0f..858d0ac 100644 --- a/examples/batch_process.py +++ b/examples/batch_process.py @@ -1,31 +1,22 @@ -from collections import Counter -from pathlib import Path +import logging import time +from pathlib import Path -import typer -import pandas as pd -import spacy -from spacy.language import Language -from spacy.tokens import Token, Doc -from spacy.pipeline import Pipe -import numpy as np import jsonlines -from tqdm import tqdm -import re -import torch +import numpy as np import pandas as pd +import spacy +import torch +import typer +from torch.utils.data import DataLoader +from tqdm import tqdm +from utilities import spacy_doc_setup -from torch.utils.data import Dataset, DataLoader import mordecai3.elastic_utilities as es_util from mordecai3.geoparse import doc_to_ex_expanded +from mordecai3.roberta_qa import setup_qa from mordecai3.torch_model import ProductionData, geoparse_model -from mordecai3.roberta_qa import setup_qa -from utilities import spacy_doc_setup -from elasticsearch import Elasticsearch, helpers -from elasticsearch_dsl import Search, Q - -import logging logger = logging.getLogger() handler = logging.StreamHandler() formatter = logging.Formatter( diff --git a/examples/batch_process_production.py b/examples/batch_process_production.py index ddc5e12..c2d5ff2 100644 --- a/examples/batch_process_production.py +++ b/examples/batch_process_production.py @@ -1,27 +1,23 @@ -from collections import Counter -from pathlib import Path +import logging +import re import time from configparser import ConfigParser +from pathlib import Path -import typer -import pandas as pd -import spacy -import numpy as np import jsonlines -from tqdm import tqdm -import re -import torch +import numpy as np import pandas as pd - +import spacy +import torch +import typer from torch.utils.data import DataLoader +from tqdm import tqdm + import mordecai3.elastic_utilities as es_util from mordecai3.geoparse import Geoparser, doc_to_ex_expanded from mordecai3.torch_model import ProductionData -from mordecai3.roberta_qa import setup_qa from mordecai3.utilities import spacy_doc_setup - -import logging logger = logging.getLogger(__name__) handler = logging.StreamHandler() formatter = logging.Formatter( diff --git a/examples/production_event.py b/examples/production_event.py index 7c3307e..c9fcfaf 100644 --- a/examples/production_event.py +++ b/examples/production_event.py @@ -1,22 +1,19 @@ -import json - +import numpy as np +import pandas as pd +import spacy import streamlit as st import torch -import pandas as pd from elasticsearch import Elasticsearch -from elasticsearch_dsl import Search, Q -import spacy -from spacy.tokens import Token, Doc -from spacy.pipeline import Pipe -import numpy as np +from elasticsearch_dsl import Search from torch.utils.data import DataLoader import mordecai3.elastic_utilities as es_util from mordecai3.geoparse import doc_to_ex_expanded +from mordecai3.roberta_qa import add_event_loc, setup_qa from mordecai3.torch_model import ProductionData, geoparse_model -from mordecai3.roberta_qa import setup_qa, add_event_loc from mordecai3.utilities import spacy_doc_setup + # for dumping raw output to JSON # https://stackoverflow.com/a/52604722 def default(obj): diff --git a/mordecai3/elastic_utilities.py b/mordecai3/elastic_utilities.py index 082ccc8..17fd578 100644 --- a/mordecai3/elastic_utilities.py +++ b/mordecai3/elastic_utilities.py @@ -1,12 +1,13 @@ -from elasticsearch import Elasticsearch, helpers -from elasticsearch_dsl import Search, Q -import numpy as np -import jellyfish -from collections import Counter -import warnings +import logging import re +import warnings +from collections import Counter + +import jellyfish +import numpy as np +from elasticsearch import Elasticsearch +from elasticsearch_dsl import Q, Search -import logging logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) diff --git a/mordecai3/error_analysis.py b/mordecai3/error_analysis.py index acd3b43..2d1b8ac 100644 --- a/mordecai3/error_analysis.py +++ b/mordecai3/error_analysis.py @@ -1,17 +1,17 @@ -from torch_model import TrainData, geoparse_model -from train import load_data -from torch.utils.data import DataLoader -import torch +import logging from collections import Counter -import numpy as np -from rich.console import Console -from rich.table import Table -import typer from pathlib import Path +import numpy as np +import torch +import typer from error_utils import evaluate_results, make_wandb_dict +from rich.console import Console +from rich.table import Table +from torch.utils.data import DataLoader +from torch_model import TrainData, geoparse_model +from train import load_data -import logging logger = logging.getLogger() handler = logging.StreamHandler() formatter = logging.Formatter( @@ -40,7 +40,6 @@ import wandb - def make_missing_table(cutoff, names, datasets): table = Table(show_header=True, header_style="bold magenta") table.add_column("Dataset") diff --git a/mordecai3/error_utils.py b/mordecai3/error_utils.py index 8926d21..260f7c1 100644 --- a/mordecai3/error_utils.py +++ b/mordecai3/error_utils.py @@ -1,6 +1,6 @@ -import torch -import numpy as np import haversine as hs +import numpy as np +import torch #es_data = datasets[2] #loader = data_loaders[2] diff --git a/mordecai3/geoparse.py b/mordecai3/geoparse.py index af9165d..e889d27 100644 --- a/mordecai3/geoparse.py +++ b/mordecai3/geoparse.py @@ -1,24 +1,24 @@ -import jsonlines -from tqdm import tqdm -import re +import logging import os +import re -import torch -import pandas as pd -import spacy -from spacy.language import Language -from spacy.tokens import Token, Span, Doc -from spacy.pipeline import Pipe import numpy as np -from torch.utils.data import Dataset, DataLoader import pkg_resources - -from mordecai3.elastic_utilities import make_conn, get_entry_by_id, get_adm1_country_entry, get_country_entry, add_es_data_doc -from mordecai3.torch_model import ProductionData, geoparse_model -from mordecai3.roberta_qa import setup_qa, add_event_loc +import spacy +import torch +from torch.utils.data import DataLoader + +from mordecai3.elastic_utilities import ( + add_es_data_doc, + get_adm1_country_entry, + get_country_entry, + get_entry_by_id, + make_conn, +) from mordecai3.mordecai_utilities import spacy_doc_setup +from mordecai3.roberta_qa import add_event_loc, setup_qa +from mordecai3.torch_model import ProductionData, geoparse_model -import logging logger = logging.getLogger() handler = logging.StreamHandler() formatter = logging.Formatter( diff --git a/mordecai3/mordecai_streamlit.py b/mordecai3/mordecai_streamlit.py index 13c93f0..b29af0a 100644 --- a/mordecai3/mordecai_streamlit.py +++ b/mordecai3/mordecai_streamlit.py @@ -1,21 +1,12 @@ -import spacy -from spacy.language import Language -from spacy.tokens import Token, Doc -from spacy.pipeline import Pipe import numpy as np -import jsonlines -from tqdm import tqdm -import re +import spacy import streamlit as st import torch -from torch.utils.data import Dataset, DataLoader - -from elastic_utilities import res_formatter, add_es_data -from torch_model import ProductionData, geoparse_model -from geoparse import doc_to_ex_expanded, Geoparser - -from elasticsearch import Elasticsearch, helpers -from elasticsearch_dsl import Search, Q +from elasticsearch import Elasticsearch +from elasticsearch_dsl import Search +from spacy.language import Language +from spacy.tokens import Token +from torch_model import geoparse_model HTML_WRAPPER = """
{}
""" diff --git a/mordecai3/mordecai_utilities.py b/mordecai3/mordecai_utilities.py index d12328f..0578667 100644 --- a/mordecai3/mordecai_utilities.py +++ b/mordecai3/mordecai_utilities.py @@ -1,6 +1,6 @@ -from spacy.tokens import Token -from spacy.language import Language import numpy as np +from spacy.language import Language +from spacy.tokens import Token #def make_country_dict(): # country = pd.read_csv("assets/wikipedia-iso-country-codes.txt") diff --git a/mordecai3/roberta_qa.py b/mordecai3/roberta_qa.py index 0ab0613..c210f1b 100644 --- a/mordecai3/roberta_qa.py +++ b/mordecai3/roberta_qa.py @@ -1,5 +1,5 @@ -from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline from spacy.tokens import Span +from transformers import pipeline model_name = "deepset/roberta-base-squad2" diff --git a/mordecai3/tests/conftest.py b/mordecai3/tests/conftest.py index b288824..e3ba52b 100644 --- a/mordecai3/tests/conftest.py +++ b/mordecai3/tests/conftest.py @@ -1,7 +1,7 @@ -from ..geoparse import Geoparser import pytest -import spacy +from ..geoparse import Geoparser + @pytest.fixture(scope='session', autouse=True) def geo(): diff --git a/mordecai3/tests/test_mordecai3.py b/mordecai3/tests/test_mordecai3.py index 81afb93..dedf35d 100644 --- a/mordecai3/tests/test_mordecai3.py +++ b/mordecai3/tests/test_mordecai3.py @@ -1,7 +1,9 @@ import pytest + from .. import elastic_utilities as es_utils from .. import geoparse + def test_statement_event_loc(geo): text = "Speaking from Berlin, President Obama expressed his hope for a peaceful resolution to the fighting in Homs and Aleppo." #text = "President Obama expressed his hope for a peaceful resolution to the fighting." diff --git a/mordecai3/torch_model.py b/mordecai3/torch_model.py index 85f65d1..f8b5368 100644 --- a/mordecai3/torch_model.py +++ b/mordecai3/torch_model.py @@ -1,15 +1,15 @@ ## Read in the BERT embedding for each place name ## and predict the country using pytorch -import numpy as np import json +import logging import os +import numpy as np import torch import torch.nn as nn -from torch.utils.data import Dataset from pandas import read_csv +from torch.utils.data import Dataset -import logging logger = logging.getLogger(__name__) handler = logging.StreamHandler() formatter = logging.Formatter( diff --git a/mordecai3/train.py b/mordecai3/train.py index fd087c7..91358d5 100644 --- a/mordecai3/train.py +++ b/mordecai3/train.py @@ -1,36 +1,36 @@ -import random +import os import pickle +import random import re -import os + import jsonlines + os.environ['KMP_DUPLICATE_LIB_OK']='True' +import datetime +import logging + +import elastic_utilities as es_util import numpy as np -from tqdm import tqdm +import spacy import torch -import torch.optim as optim import torch.nn as nn -from torch.utils.data import DataLoader -import xmltodict -import wandb +import torch.optim as optim import typer -import spacy -from spacy.tokens import DocBin -import datetime -import multiprocessing +import wandb +import xmltodict +from error_utils import make_wandb_dict +from geoparse import guess_in_rel -from torch_model import geoparse_model -import elastic_utilities as es_util # Currently getting this error: ImportError: attempted relative import with no known parent package # when I run the line below. # from .mordecai_utilities import spacy_doc_setup from mordecai_utilities import spacy_doc_setup -from torch_model import TrainData -from error_utils import make_wandb_dict -from geoparse import guess_in_rel -import elasticsearch +from spacy.tokens import DocBin +from torch.utils.data import DataLoader +from torch_model import TrainData, geoparse_model +from tqdm import tqdm -import logging logger = logging.getLogger() handler = logging.StreamHandler() formatter = logging.Formatter( @@ -625,8 +625,8 @@ def train(batch_size: int = typer.Option(32, "--batch_size"), # input ba loss_func=nn.CrossEntropyLoss() # single label, multi-class optimizer = optim.Adam(model.parameters(), lr=config.lr) if config.avg_params: - from torch.optim.swa_utils import AveragedModel, SWALR from torch.optim.lr_scheduler import CosineAnnealingLR + from torch.optim.swa_utils import SWALR, AveragedModel swa_model = AveragedModel(model) scheduler = CosineAnnealingLR(optimizer, T_max=config.epochs+1) diff --git a/setup.py b/setup.py index 70ce4a9..40289bc 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ -from setuptools import setup, find_packages +from setuptools import find_packages, setup + setup( name = 'mordecai3', version='3.0.0a',