diff --git a/ner-submission/Dockerfile b/ner-submission/Dockerfile
index c2a3ed5..713147e 100644
--- a/ner-submission/Dockerfile
+++ b/ner-submission/Dockerfile
@@ -1,10 +1,6 @@
-# Dockerfile
+# docker build -t fschlatt/authorship-verification-trivial:0.0.1 .
 FROM fschlatt/natural-language-processing-exercises:0.0.1
 
-RUN pip install sklearn-crfsuite
-
 ADD run.py /code/run.py
-ADD train.py /code/train.py
-ADD model.joblib /code/model.joblib
 
-ENTRYPOINT ["python3", "/code/run.py"]
+ENTRYPOINT [ "python3", "/code/run.py" ]
diff --git a/ner-submission/run.py b/ner-submission/run.py
index de74262..c3038d2 100644
--- a/ner-submission/run.py
+++ b/ner-submission/run.py
@@ -1,72 +1,35 @@
 from pathlib import Path
-from joblib import load
 from tira.rest_api_client import Client
 from tira.third_party_integrations import get_output_directory
-import pandas as pd
 
-def preprocess_data(text_data):
-    data = []
-    for i in range(len(text_data)):
-        sentence = text_data.iloc[i]['sentence'].split()
-        data.append(sentence)
-    return data
-
-def extract_features(sentence, i):
-    word = sentence[i]
-    features = {
-        'word': word,
-        'is_upper': word.isupper(),
-        'is_title': word.istitle(),
-        'is_digit': word.isdigit(),
-        'suffix-3': word[-3:],
-    }
-    if i > 0:
-        word1 = sentence[i-1]
-        features.update({
-            '-1:word': word1,
-            '-1:is_upper': word1.isupper(),
-            '-1:is_title': word1.istitle(),
-            '-1:is_digit': word1.isdigit(),
-        })
-    else:
-        features['BOS'] = True
-
-    if i < len(sentence)-1:
-        word1 = sentence[i+1]
-        features.update({
-            '+1:word': word1,
-            '+1:is_upper': word1.isupper(),
-            '+1:is_title': word1.istitle(),
-            '+1:is_digit': word1.isdigit(),
-        })
+# Simple heuristic function to determine entity type based on common patterns
+def simple_heuristic_token_classification(token):
+    if token.istitle():
+        return "B-per" # Assume title case words are persons
+    elif token.isupper():
+        return "B-org" # Assume upper case words are organizations
     else:
-        features['EOS'] = True
-
-    return features
-
-def sent2features(sentence):
-    return [extract_features(sentence, i) for i in range(len(sentence))]
+        return "O" # Default to outside any named entity
 
 if __name__ == "__main__":
-    tira = Client()
-
-    # Load the data
-    text_validation = tira.pd.inputs("nlpbuw-fsu-sose-24", "ner-validation-20240612-training")
-
-    # Preprocess data
-    val_data = preprocess_data(text_validation)
-    X_val = [sent2features(s) for s in val_data]
 
-    # Load the model
-    model = load(Path(__file__).parent / "model.joblib")
+    tira = Client()
 
-    # Predict
-    y_pred = model.predict(X_val)
+    # loading validation data (automatically replaced by test data when run on tira)
+    text_validation = tira.pd.inputs(
+        "nlpbuw-fsu-sose-24", "ner-validation-20240612-training"
+    )
+    targets_validation = tira.pd.truths(
+        "nlpbuw-fsu-sose-24", "ner-validation-20240612-training"
+    )
 
-    # Save predictions
+    # labeling the data with simple heuristics
     predictions = text_validation.copy()
-    predictions['tags'] = [list(x) for x in y_pred]
+    predictions['tags'] = predictions['sentence'].apply(lambda x: [simple_heuristic_token_classification(token) for token in x.split(' ')])
     predictions = predictions[['id', 'tags']]
-
+
+    # saving the prediction
     output_directory = get_output_directory(str(Path(__file__).parent))
-    predictions.to_json(Path(output_directory) / "predictions.jsonl", orient="records", lines=True)
+    predictions.to_json(
+        Path(output_directory) / "predictions.jsonl", orient="records", lines=True
+    )
diff --git a/ner-submission/train.py b/ner-submission/train.py
deleted file mode 100644
index 96ed753..0000000
--- a/ner-submission/train.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from pathlib import Path
-from joblib import dump
-import pandas as pd
-import sklearn_crfsuite
-from sklearn_crfsuite import metrics
-from tira.rest_api_client import Client
-
-def preprocess_data(text_data, labels_data):
-    data = []
-    for i in range(len(text_data)):
-        sentence = text_data.iloc[i]['sentence'].split()
-        labels = labels_data.iloc[i]['tags']
-        data.append((sentence, labels))
-    return data
-
-def extract_features(sentence, i):
-    word = sentence[i]
-    features = {
-        'word': word,
-        'is_upper': word.isupper(),
-        'is_title': word.istitle(),
-        'is_digit': word.isdigit(),
-        'suffix-3': word[-3:],
-    }
-    if i > 0:
-        word1 = sentence[i-1]
-        features.update({
-            '-1:word': word1,
-            '-1:is_upper': word1.isupper(),
-            '-1:is_title': word1.istitle(),
-            '-1:is_digit': word1.isdigit(),
-        })
-    else:
-        features['BOS'] = True
-
-    if i < len(sentence)-1:
-        word1 = sentence[i+1]
-        features.update({
-            '+1:word': word1,
-            '+1:is_upper': word1.isupper(),
-            '+1:is_title': word1.istitle(),
-            '+1:is_digit': word1.isdigit(),
-        })
-    else:
-        features['EOS'] = True
-
-    return features
-
-def sent2features(sentence):
-    return [extract_features(sentence, i) for i in range(len(sentence))]
-
-def sent2labels(sentence):
-    return [label for label in sentence]
-
-if __name__ == "__main__":
-    tira = Client()
-
-    # Load the data
-    text_train = tira.pd.inputs("nlpbuw-fsu-sose-24", "ner-training-20240612-training")
-    targets_train = tira.pd.truths("nlpbuw-fsu-sose-24", "ner-training-20240612-training")
-
-    # Preprocess data
-    train_data = preprocess_data(text_train, targets_train)
-    X_train = [sent2features(s) for s, t in train_data]
-    y_train = [t for s, t in train_data]
-
-    # Train CRF model
-    crf = sklearn_crfsuite.CRF(
-        algorithm='lbfgs',
-        c1=0.1,
-        c2=0.1,
-        max_iterations=100,
-        all_possible_transitions=True
-    )
-    crf.fit(X_train, y_train)
-
-    # Save the model
-    dump(crf, Path(__file__).parent / "model.joblib")
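For reference, a minimal standalone sketch of the heuristic tagger that the new run.py applies; the sample sentence and printed tags below are illustrative only and not taken from the dataset (real input comes from tira.pd.inputs(...) as in the diff above).

# Mirrors simple_heuristic_token_classification from run.py:
# title case -> person, all caps -> organization, everything else -> outside.
def simple_heuristic_token_classification(token):
    if token.istitle():
        return "B-per"  # assume title case words are persons
    elif token.isupper():
        return "B-org"  # assume upper case words are organizations
    return "O"          # default: outside any named entity

# Illustrative usage on a hand-picked sentence (hypothetical example):
sentence = "Angela Merkel met NASA officials in Berlin"
print([simple_heuristic_token_classification(t) for t in sentence.split(" ")])
# -> ['B-per', 'B-per', 'O', 'B-org', 'O', 'O', 'B-per']

Note that "Berlin" is mis-tagged as a person here, which is the expected failure mode of a casing-only heuristic: it cannot distinguish persons from locations or other capitalized entities, unlike the feature-based CRF that this commit removes.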