From 2d2e1e0cf4ff956f9265a192756f5f5db0afc180 Mon Sep 17 00:00:00 2001 From: Sven Hertling Date: Sun, 10 Sep 2023 15:14:53 +0200 Subject: [PATCH] added readme and fixed typo --- examples/llm-transformers/README.md | 63 +++++++++++ .../examples/llm_transformers/CLIOptions.java | 107 +++++++++--------- .../melt/examples/llm_transformers/Main.java | 15 +-- ...SentenceTransformersPredicateBadHosts.java | 47 ++++++++ 4 files changed, 165 insertions(+), 67 deletions(-) create mode 100644 examples/llm-transformers/README.md create mode 100644 examples/llm-transformers/src/main/java/de/uni_mannheim/informatik/dws/melt/examples/llm_transformers/SentenceTransformersPredicateBadHosts.java diff --git a/examples/llm-transformers/README.md b/examples/llm-transformers/README.md new file mode 100644 index 0000000000..fcc913981e --- /dev/null +++ b/examples/llm-transformers/README.md @@ -0,0 +1,63 @@ +# Example command line application for running an LLM + +This example provides a command line application to run a matching task using an LLM. +It is also the implementation for the paper `OLaLa: Ontology Matching with Large Language Models` + + +## Installation +To run all examples, first MELT needs to be built and then the correct Python environment needs to be created. 
+ +### Build MELT +In the main directory of MELT execute +``` +mvn clean install +``` + + +### Setup python environment + +- for pytorch 1.13 (recommended) +``` +conda create -n melt python=3.9 +conda activate melt +conda install pytorch==1.13.1 torchvision==0.14.1 pytorch-cuda=11.7 -c pytorch -c nvidia +conda install numpy scikit-learn pandas gensim flask "Werkzeug<=2.2.3" sentencepiece "protobuf==3.20.1" +conda install accelerate -c conda-forge +pip install bitsandbytes transformers sentence-transformers +``` + + +- for pytorch 2 +``` +conda create -n melt python=3.9 +conda activate melt +conda install pytorch torchvision pytorch-cuda=11.8 -c pytorch -c nvidia +conda install numpy scikit-learn pandas gensim flask "Werkzeug<=2.2.3" sentencepiece "protobuf==3.20.1" +conda install accelerate -c conda-forge +pip install bitsandbytes transformers sentence-transformers +``` + + +## Running the default configuration +The default configuration from the paper `OLaLa: Ontology Matching with Large Language Models` +can be executed with the following command: + +``` +java -jar llm-transformers-1.0-jar-with-dependencies.jar \ + --python {python executable location} \ + --transformerscache {path to transformers cache} \ + --gpu {gpus to use e.g. 1,2} \ + --prompt 7 \ + --includeloadingarguments \ + --textextractor 4 \ + --transformermodels "upstage/Llama-2-70b-instruct-v2" \ + --tracks anatomy \ + > out.txt 2> err.txt +``` + +Replace `{python executable location}` with the path to the Python executable from the virtual environment created above. +To get the path, activate the environment and execute `which python` (Linux) or `where python` (Windows). + +The path to the transformers cache (where all the models are stored) can be changed with the `transformerscache` option. +Leave it out completely to use the default (usually in the home folder). +The size of a `70B` variant (a single model) is usually around 130 GB. 
diff --git a/examples/llm-transformers/src/main/java/de/uni_mannheim/informatik/dws/melt/examples/llm_transformers/CLIOptions.java b/examples/llm-transformers/src/main/java/de/uni_mannheim/informatik/dws/melt/examples/llm_transformers/CLIOptions.java index 61bda8c672..c4624ae967 100644 --- a/examples/llm-transformers/src/main/java/de/uni_mannheim/informatik/dws/melt/examples/llm_transformers/CLIOptions.java +++ b/examples/llm-transformers/src/main/java/de/uni_mannheim/informatik/dws/melt/examples/llm_transformers/CLIOptions.java @@ -45,29 +45,29 @@ public class CLIOptions { private static final Logger LOGGER = LoggerFactory.getLogger(CLIOptions.class); - private static final List PREDEFINED_PROMTS = createPredefinedPromts(); - private static List createPredefinedPromts(){ - List promts = new ArrayList<>(); + private static final List PREDEFINED_PROMPTS = createPredefinedPrompts(); + private static List createPredefinedPrompts(){ + List prompts = new ArrayList<>(); /******************** * zero shot ********************/ //0 - promts.add("Classify if the following two concepts are the same.\n### First concept:\n{left}\n### Second concept:\n{right}\n### Answer:\n"); + prompts.add("Classify if the following two concepts are the same.\n### First concept:\n{left}\n### Second concept:\n{right}\n### Answer:\n"); //1 - adding more context for anatomy - promts.add("Classify if two concepts refer to the same real word entiy. This is an ontology matching task between the anatomy of human and mouse. \n" + prompts.add("Classify if two concepts refer to the same real word entiy. This is an ontology matching task between the anatomy of human and mouse. \n" + "First concept: {left}\n" + "Second concept: {right}\n" + "Answer:"); //2 - very simple - promts.add("Is {left} and {right} the same? The answer which can be yes or no is "); + prompts.add("Is {left} and {right} the same? 
The answer which can be yes or no is "); //3 - more context in general - promts.add("The task is ontology matching. Given two concepts, the task is to classify if they are the same or not.\n " + prompts.add("The task is ontology matching. Given two concepts, the task is to classify if they are the same or not.\n " + "The first concept is: {left}\n" + "The second concept is: {right}\n" + "The answer which can be yes or no is:"); //4 - another test - promts.add("Given two concepts decide if they match or not.\n" + prompts.add("Given two concepts decide if they match or not.\n" + "First concept: {left}\n" + "Second concept: {right}\n" + "Answer(yes or no):"); @@ -77,12 +77,12 @@ private static List createPredefinedPromts(){ * Few shot ********************/ // 5 - 2 shot - promts.add("### Concept one: endocrine pancreas secretion ### Concept two: Pancreatic Endocrine Secretion ### Answer: yes\n" + prompts.add("### Concept one: endocrine pancreas secretion ### Concept two: Pancreatic Endocrine Secretion ### Answer: yes\n" + "### Concept one: urinary bladder urothelium ### Concept two: Transitional Epithelium ### Answer: no\n" + "### Concept one: {left} ### Concept two: {right} ### Answer: "); // 6 - 6 shot - promts.add("### Concept one: endocrine pancreas secretion ### Concept two: Pancreatic Endocrine Secretion ### Answer: yes\n" + prompts.add("### Concept one: endocrine pancreas secretion ### Concept two: Pancreatic Endocrine Secretion ### Answer: yes\n" + "### Concept one: urinary bladder urothelium ### Concept two: Transitional Epithelium ### Answer: no\n" + "### Concept one: trigeminal V nerve ophthalmic division ### Concept two: Ophthalmic Nerve ### Answer: yes\n" + "### Concept one: foot digit 1 phalanx ### Concept two: Foot Digit 2 Phalanx ### Answer: no\n" @@ -92,7 +92,7 @@ private static List createPredefinedPromts(){ // 7 - 6 shot with - promts.add("Classify if two descriptions refer to the same real world entity (ontology matching).\n" + 
prompts.add("Classify if two descriptions refer to the same real world entity (ontology matching).\n" + "### Concept one: endocrine pancreas secretion ### Concept two: Pancreatic Endocrine Secretion ### Answer: yes\n" + "### Concept one: urinary bladder urothelium ### Concept two: Transitional Epithelium ### Answer: no\n" + "### Concept one: trigeminal V nerve ophthalmic division ### Concept two: Ophthalmic Nerve ### Answer: yes\n" @@ -102,20 +102,20 @@ private static List createPredefinedPromts(){ + "### Concept one: {left} ### Concept two: {right} ### Answer: "); // 8 - zero shot chain of thought - promts.add("Classify if two descriptions refer to the same real world entity (ontology matching).\n" + prompts.add("Classify if two descriptions refer to the same real world entity (ontology matching).\n" + "First concept: {left}\n" + "Second concept: {right}\n" + "Answer can be yes or no. Let's think step by step.\n"); // 9 - few shot chain of thought - promts.add("Classify if two descriptions refer to the same real world entity (ontology matching).\n" + prompts.add("Classify if two descriptions refer to the same real world entity (ontology matching).\n" + "### Concept one: endocrine pancreas secretion ### Concept two: Pancreatic Endocrine Secretion ### Explanation: Both describe the process of the pancreas releasing hormones into the bloodstream ### Answer: yes\n" + "### Concept one: foot digit 1 phalanx ### Concept two: Foot Digit 2 Phalanx ### Explanation: The concepts refer to different bones in the toes of the foot ### Answer: no\n" + "### Concept one: {left} ### Concept two: {right} ### Explanation: " ); //10 - which is number 7 plus rdf info - promts.add("Classify if two descriptions (given as RDF) refer to the same real world entity (ontology matching).\n" + prompts.add("Classify if two descriptions (given as RDF) refer to the same real world entity (ontology matching).\n" + "### Concept one: endocrine pancreas secretion ### Concept two: Pancreatic Endocrine 
Secretion ### Answer: yes\n" + "### Concept one: urinary bladder urothelium ### Concept two: Transitional Epithelium ### Answer: no\n" + "### Concept one: trigeminal V nerve ophthalmic division ### Concept two: Ophthalmic Nerve ### Answer: yes\n" @@ -133,14 +133,14 @@ private static List createPredefinedPromts(){ //zero shot //11 - - promts.add("The task is ontology matching (find the description which refer to the same real world entity). " + prompts.add("The task is ontology matching (find the description which refer to the same real world entity). " + "Which of the following descriptions fits best to this description: {left}?\n" + "{right}" + "Answer with the corresponding letter or \"none\" if no description fits. Answer: "); //few shot //12 - promts.add("The task is ontology matching and to find the description which refer to the same real world entity. " + prompts.add("The task is ontology matching and to find the description which refer to the same real world entity. " + "Which of the following descriptions fits best to this description: endocrine pancreas secretion?\n" + "\t a) Islet of Langerhans\n" + "\t b) Pancreatic Secretion\n" @@ -150,7 +150,7 @@ private static List createPredefinedPromts(){ + "Which of the following descriptions fits best to this description: {left}?\n" + "{right}\n" + "Answer with the corresponding letter or \"none\" if no description fits. 
Answer:" ); - return promts; + return prompts; } @@ -265,8 +265,8 @@ private Options createOptions(){ .build()); options.addOption(Option.builder("isp") - .longOpt("includesystempromt") - .desc("Include the systempromt for the specific models.") + .longOpt("includesystemprompt") + .desc("Include the systemprompt for the specific models.") .build()); options.addOption(Option.builder("ila") @@ -303,16 +303,16 @@ private Options createOptions(){ .build()); options.addOption(Option.builder("pr") - .longOpt("promt") + .longOpt("prompt") .required() .hasArgs() - .desc("The promts to use - the texts of the resources are inserted by replacing the text {left} and {right}." - + "Can also be a number which then uses a predefined promt. The number can range from 0 to " + (PREDEFINED_PROMTS.size() - 1)) + .desc("The prompts to use - the texts of the resources are inserted by replacing the text {left} and {right}." + + "Can also be a number which then uses a predefined prompt. The number can range from 0 to " + (PREDEFINED_PROMPTS.size() - 1)) .build()); options.addOption(Option.builder("r") .longOpt("replace") - .desc("Replace the user promt") + .desc("Replace the user prompt") .build() ); @@ -410,7 +410,7 @@ public void initializeStaticCmdParameters(){ } } - public boolean isIncludeSystemPromt(){ + public boolean isIncludeSystemPrompt(){ return cmd.hasOption("isp"); } @@ -422,7 +422,7 @@ public boolean isIncludeLoadingArguments(){ return cmd.hasOption("ila"); } - public boolean isReplacePromt(){ + public boolean isReplacePrompt(){ return cmd.hasOption("r"); } @@ -457,50 +457,45 @@ public int getKNeighbours(){ } } - public List> getPromts(TextExtractorMap textExtractorMap){ - List promts = Arrays.asList(cmd.getOptionValues("pr")); - if (promts.isEmpty()) { - LOGGER.warn("No promts specified. 
ABORTING program."); + public List> getPrompts(TextExtractorMap textExtractorMap){ + List prompts = Arrays.asList(cmd.getOptionValues("pr")); + if (prompts.isEmpty()) { + LOGGER.warn("No prompts specified. ABORTING program."); System.exit(1); } - //process promts - List> finalPromts = new ArrayList<>(); - for(String promt : promts){ - if(promt.equals("7auto")){ - finalPromts.add(new SimpleEntry<>("7auto", getAutoPromt(textExtractorMap, "Classify if two descriptions refer to the same real world entity (ontology matching).\n"))); + //process prompts + List> finalPrompts = new ArrayList<>(); + for(String prompt : prompts){ + if(prompt.equals("7auto")){ + finalPrompts.add(new SimpleEntry<>("7auto", getAutoPrompt(textExtractorMap, "Classify if two descriptions refer to the same real world entity (ontology matching).\n"))); continue; } - if(promt.equals("9auto")){ - finalPromts.add(new SimpleEntry<>("9auto", getAutoPromt(textExtractorMap, "Classify if two descriptions (given as RDF) refer to the same real world entity (ontology matching).\n"))); + if(prompt.equals("9auto")){ + finalPrompts.add(new SimpleEntry<>("9auto", getAutoPrompt(textExtractorMap, "Classify if two descriptions (given as RDF) refer to the same real world entity (ontology matching).\n"))); continue; } - if(promt.equals("12auto")){ - finalPromts.add(new SimpleEntry<>("12auto", getAutoPromtChooser(textExtractorMap))); + if(prompt.equals("12auto")){ + finalPrompts.add(new SimpleEntry<>("12auto", getAutoPromptChooser(textExtractorMap))); continue; } try{ - int promtNumber = Integer.parseInt(promt); + int promptNumber = Integer.parseInt(prompt); //range check - if(promtNumber < 0 || promtNumber >= PREDEFINED_PROMTS.size()){ - LOGGER.warn("Argument promts (-pr) which is set to \"{}\" is not in the range 0-{}.", promt, PREDEFINED_PROMTS.size()-1); + if(promptNumber < 0 || promptNumber >= PREDEFINED_PROMPTS.size()){ + LOGGER.warn("Argument prompts (-pr) which is set to \"{}\" is not in the range 0-{}.", 
prompt, PREDEFINED_PROMPTS.size()-1); System.exit(1); return null; } - finalPromts.add(new SimpleEntry<>(Integer.toString(promtNumber), PREDEFINED_PROMTS.get(promtNumber))); + finalPrompts.add(new SimpleEntry<>(Integer.toString(promptNumber), PREDEFINED_PROMPTS.get(promptNumber))); } catch(NumberFormatException e){ - finalPromts.add(new SimpleEntry<>(getPromtIdentification(promt), promt)); + finalPrompts.add(new SimpleEntry<>(getPromptIdentification(prompt), prompt)); } - - //if(cmd.hasOption("r")){ - // finalPromt = finalPromt.replace("###", "~~~"); - //} - } - return finalPromts; + return finalPrompts; } - private String getAutoPromt(TextExtractorMap extractor, String initialText){ + private String getAutoPrompt(TextExtractorMap extractor, String initialText){ List list = new ArrayList<>(); list.add(new Correspondence("http://mouse.owl#MA_0002517", "http://human.owl#NCI_C33255", 1.0, CorrespondenceRelation.EQUIVALENCE)); list.add(new Correspondence("http://mouse.owl#MA_0001693", "http://human.owl#NCI_C13318", 1.0, CorrespondenceRelation.INCOMPAT)); @@ -526,7 +521,7 @@ private String getAutoPromt(TextExtractorMap extractor, String initialText){ return sb.toString(); } - private String getAutoPromtChooser(TextExtractorMap extractor){ + private String getAutoPromptChooser(TextExtractorMap extractor){ TestCase tc = TrackRepository.Anatomy.Default.getFirstTestCase(); OntModel source = tc.getSourceOntology(OntModel.class); OntModel target = tc.getTargetOntology(OntModel.class); @@ -549,10 +544,10 @@ private String getText(OntModel m, String url, TextExtractorMap extractor){ //return StringProcessing.normalizeOnlyCamelCaseAndUnderscore(oneValue); } - private String getPromtIdentification(String promt){ - //promt identification is the first word plus a short hash of the promt. - int i = promt.indexOf(' '); - String firstWord = promt; + private String getPromptIdentification(String prompt){ + //prompt identification is the first word plus a short hash of the prompt. 
+ int i = prompt.indexOf(' '); + String firstWord = prompt; if(i >= 0){ firstWord = firstWord.substring(0, i); } @@ -561,7 +556,7 @@ private String getPromtIdentification(String promt){ firstWord = firstWord.substring(0, 15); } - return firstWord+DigestUtils.sha256Hex(promt).substring(0, 7); + return firstWord+DigestUtils.sha256Hex(prompt).substring(0, 7); } diff --git a/examples/llm-transformers/src/main/java/de/uni_mannheim/informatik/dws/melt/examples/llm_transformers/Main.java b/examples/llm-transformers/src/main/java/de/uni_mannheim/informatik/dws/melt/examples/llm_transformers/Main.java index c47d370531..01eff2a363 100644 --- a/examples/llm-transformers/src/main/java/de/uni_mannheim/informatik/dws/melt/examples/llm_transformers/Main.java +++ b/examples/llm-transformers/src/main/java/de/uni_mannheim/informatik/dws/melt/examples/llm_transformers/Main.java @@ -4,7 +4,6 @@ import de.uni_mannheim.informatik.dws.melt.matching_base.MeltUtil; import de.uni_mannheim.informatik.dws.melt.matching_data.GoldStandardCompleteness; import de.uni_mannheim.informatik.dws.melt.matching_data.TestCase; -import de.uni_mannheim.informatik.dws.melt.matching_data.TrackRepository; import de.uni_mannheim.informatik.dws.melt.matching_eval.ExecutionResult; import de.uni_mannheim.informatik.dws.melt.matching_eval.ExecutionResultSet; import de.uni_mannheim.informatik.dws.melt.matching_eval.Executor; @@ -25,21 +24,15 @@ import de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.metalevel.ConfidenceCombiner; import de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.metalevel.ForwardAlwaysMatcher; import de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.util.StringProcessing; -import de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.util.textExtractors.TextExtractorAllAnnotationProperties; -import de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.util.textExtractors.TextExtractorAllLiterals; -import 
de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.util.textExtractors.TextExtractorOnlyLabel; import de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.util.textExtractors.TextExtractorSet; import de.uni_mannheim.informatik.dws.melt.matching_ml.python.PythonServer; import de.uni_mannheim.informatik.dws.melt.matching_ml.python.nlptransformers.LLMBase; import de.uni_mannheim.informatik.dws.melt.matching_ml.python.nlptransformers.LLMBinaryFilter; import de.uni_mannheim.informatik.dws.melt.matching_ml.python.nlptransformers.LLMChooseGivenEntityFilter; import de.uni_mannheim.informatik.dws.melt.matching_ml.python.nlptransformers.SentenceTransformersMatcher; -import de.uni_mannheim.informatik.dws.melt.matching_ml.python.nlptransformers.SentenceTransformersPredicateInputAlignment; -import de.uni_mannheim.informatik.dws.melt.matching_owlapi_matchers.AlcomoFilter; import de.uni_mannheim.informatik.dws.melt.yet_another_alignment_api.Alignment; import de.uni_mannheim.informatik.dws.melt.yet_another_alignment_api.Correspondence; import java.io.File; -import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -174,9 +167,9 @@ private static void run(CLIOptions cliOptions) throws Exception { for(Entry textExtractor : cliOptions.getTextExtractors()){ for(String model : cliOptions.getTransformerModels()) { LLMConfiguration modelConfig = LLMConfiguration.getConfiguration(model); - for(Entry promt : cliOptions.getPromts(textExtractor.getValue())) { + for(Entry promt : cliOptions.getPrompts(textExtractor.getValue())) { - String configurationName = processModelName(model) + "promt" + promt.getKey() + cliOptions.isReplacePromt() + cliOptions.isIncludeSystemPromt() + + String configurationName = processModelName(model) + "promt" + promt.getKey() + cliOptions.isReplacePrompt() + cliOptions.isIncludeSystemPrompt() + "_loading" + cliOptions.isIncludeLoadingArguments() + "_" + textExtractor.getKey() + "_isChoose" + 
cliOptions.isChoose() + "_" + recallGenerationName; configurationName = configurationName.replaceAll(" ", "_"); @@ -185,10 +178,10 @@ private static void run(CLIOptions cliOptions) throws Exception { //TextExtractorMap modifiedTextExtractor = TextExtractorMap.appendStringPostProcessing(textExtractor.getValue(), StringProcessing::normalizeOnlyCamelCaseAndUnderscore); String finalPromt = promt.getValue(); - if(cliOptions.isReplacePromt()){ + if(cliOptions.isReplacePrompt()){ finalPromt = finalPromt.replace("###", "~~~"); } - if(cliOptions.isIncludeSystemPromt()){ + if(cliOptions.isIncludeSystemPrompt()){ finalPromt = modelConfig.processPromt(finalPromt); } diff --git a/examples/llm-transformers/src/main/java/de/uni_mannheim/informatik/dws/melt/examples/llm_transformers/SentenceTransformersPredicateBadHosts.java b/examples/llm-transformers/src/main/java/de/uni_mannheim/informatik/dws/melt/examples/llm_transformers/SentenceTransformersPredicateBadHosts.java new file mode 100644 index 0000000000..7ac703aa79 --- /dev/null +++ b/examples/llm-transformers/src/main/java/de/uni_mannheim/informatik/dws/melt/examples/llm_transformers/SentenceTransformersPredicateBadHosts.java @@ -0,0 +1,47 @@ +package de.uni_mannheim.informatik.dws.melt.examples.llm_transformers; + +import de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.filter.BadHostsFilter; +import de.uni_mannheim.informatik.dws.melt.matching_ml.python.nlptransformers.SentenceTransformersPredicate; +import de.uni_mannheim.informatik.dws.melt.yet_another_alignment_api.Alignment; +import java.util.Properties; +import org.apache.jena.ontology.OntModel; +import org.apache.jena.ontology.OntResource; + +public class SentenceTransformersPredicateBadHosts implements SentenceTransformersPredicate { + + private String sourceHost; + private String targetHost; + + public SentenceTransformersPredicateBadHosts(){ + this.sourceHost = ""; + this.targetHost = ""; + } + public void init(OntModel source, OntModel target, Alignment 
inputAlignment, Properties parameters){ + this.sourceHost = BadHostsFilter.getHostURIOfModelBySampling(source); + this.targetHost = BadHostsFilter.getHostURIOfModelBySampling(target); + } + + @Override + public boolean keepSourceEntity(OntResource r){ + return checkResource(r, this.sourceHost); + } + + @Override + public boolean keepTargetEntity(OntResource r){ + return checkResource(r, this.targetHost); + } + + private boolean checkResource(OntResource r, String hostURI){ + String uri = r.getURI(); + if(uri == null) + return false; + String host = BadHostsFilter.getHostOfURI(uri); + if(host == null || host.isEmpty()) + return true; // do not filter -> keep it + if(hostURI.equals(host)){ + return true; // do not filter -> keep it + }else{ + return false; // filter it out + } + } +}