Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

731 Performance Analysis for Nihms Loader #63

Merged
merged 12 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,21 @@ public class PmidLookup {
".fcgi?db=pubmed&retmode=json&rettype=abstract&id=%s";

private static final String ENTREZ_PATH_KEY = "entrez.pmid.path";
private static final String ENTREZ_TIME_OUT_KEY = "entrez.time.out";
private static final String DEFAULT_ENTREZ_TIME_OUT = "400";

private static final String JSON_ERROR_KEY = "error";
private static final String JSON_RESULT_KEY = "result";

private String entrezPath;
private String entrezTimeout;

/**
 * Creates a lookup configured from JVM system properties.
 * <p>
 * The Entrez path is taken from the {@code entrez.pmid.path} property and the timeout from the
 * {@code entrez.time.out} property. When a property is not set, the matching built-in default
 * ({@code DEFAULT_ENTREZ_PATH} or {@code DEFAULT_ENTREZ_TIME_OUT}) is used instead. The timeout
 * is kept as the raw string value and is not validated here.
 */
public PmidLookup() {
    this.entrezTimeout = System.getProperty(ENTREZ_TIME_OUT_KEY, DEFAULT_ENTREZ_TIME_OUT);
    this.entrezPath = System.getProperty(ENTREZ_PATH_KEY, DEFAULT_ENTREZ_PATH);
}

/**
Expand Down Expand Up @@ -101,7 +105,7 @@ public JSONObject retrievePubMedRecordAsJson(String pmid) {
if (jsonRecord == null) {
// pause and retry once to allow for API limitations
LOG.info("Pausing before trying to pull PMID {} from Entrez again", pmid);
TimeUnit.MILLISECONDS.sleep(400);
TimeUnit.MILLISECONDS.sleep(Long.parseLong(entrezTimeout));
jsonRecord = retrieveJsonFromApi(pmid);
}
} catch (InterruptedException e) {
Expand Down
16 changes: 12 additions & 4 deletions pass-nihms-loader/nihms-data-transform-load/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,13 @@
<artifactId>logback-classic</artifactId>
</dependency>

<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-collections4</artifactId>
<version>4.2</version>
<scope>compile</scope>
</dependency>

<!-- Test dependencies -->
<dependency>
<groupId>org.mockito</groupId>
Expand All @@ -84,11 +91,12 @@
<version>4.8.1</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-collections4</artifactId>
<version>4.2</version>
<scope>compile</scope>
<groupId>org.wiremock</groupId>
<artifactId>wiremock</artifactId>
<version>3.2.0</version>
<scope>test</scope>
</dependency>
</dependencies>

Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
package org.eclipse.pass.loader.nihms;

import static com.github.tomakehurst.wiremock.client.WireMock.get;
import static com.github.tomakehurst.wiremock.client.WireMock.ok;
import static com.github.tomakehurst.wiremock.client.WireMock.stubFor;
import static com.github.tomakehurst.wiremock.client.WireMock.urlMatching;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.fail;

import java.io.File;
import java.util.List;

import com.github.tomakehurst.wiremock.junit5.WireMockRuntimeInfo;
import com.github.tomakehurst.wiremock.junit5.WireMockTest;
import org.eclipse.pass.support.client.PassClientSelector;
import org.eclipse.pass.support.client.RSQL;
import org.eclipse.pass.support.client.model.Grant;
Expand All @@ -22,6 +28,7 @@
*
* @author Karen Hanson
*/
@WireMockTest
public class TransformAndLoadSmokeIT extends NihmsSubmissionEtlITBase {

@BeforeEach
Expand All @@ -36,28 +43,37 @@ public void setup() throws Exception {
* @throws Exception if an error occurs
*/
@Test
public void smokeTestLoadAndTransform() throws Exception {
public void smokeTestLoadAndTransform(WireMockRuntimeInfo wmRuntimeInfo) throws Exception {
final int wmPort = wmRuntimeInfo.getHttpPort();
System.setProperty("entrez.pmid.path", "http://localhost:" + wmPort +
"/entrez/eutils/esummary.fcgi?db=pubmed&retmode=json&rettype=abstract&id=%s");
System.setProperty("entrez.time.out", "0");
String jsonErrorResponse = "{\"error\": \"cannot get document summary\"}";

stubFor(get(urlMatching("/entrez/eutils/esummary.fcgi\\?db=pubmed&retmode=json&rettype=abstract&id=([0-9]*)"))
.willReturn(ok(jsonErrorResponse)));

NihmsTransformLoadApp app = new NihmsTransformLoadApp(null);

app.run();

PassClientSelector<RepositoryCopy> repoCopySelector = new PassClientSelector<>(RepositoryCopy.class);
PassClientSelector<Publication> publicationSelector = new PassClientSelector<>(Publication.class);
PassClientSelector<Submission> submissionSelector = new PassClientSelector<>(Submission.class);

//now that it has run, let's do some basic tallies to make sure they are as expected:

//make sure RepositoryCopies are all in before moving on so we can be sure the counts are done.
repoCopySelector.setFilter(RSQL.notEquals("id", "-1"));
List<RepositoryCopy> repositoryCopies = passClient.selectObjects(repoCopySelector).getObjects();
assertEquals(26, repositoryCopies.size());
assertEquals(23, repositoryCopies.size());

publicationSelector.setFilter(RSQL.notEquals("id", "-1"));
List<Publication> publications = passClient.selectObjects(publicationSelector).getObjects();
assertEquals(37, publications.size());
assertEquals(32, publications.size());

submissionSelector.setFilter(RSQL.notEquals("id", "-1"));
List<Submission> submissions = passClient.selectObjects(submissionSelector).getObjects();
assertEquals(37, submissions.size());
assertEquals(32, submissions.size());

//reset file names:
File downloadDir = new File(path);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,9 @@ PMID,PMCID,NIHMSID,Grant Number,PI Name,Publication Date,NIHMS file deposited,NI
78787878,,787878,P20 HHHHHH,"COFFEE, K",6/13/2017,5/30/2017,5/30/2017,7/10/2017,,Article O,not used,not used,not used,no,not used
89898989,,898989,N01 IIIIII,"BLUEMKE, DAVID A",11/7/2017,6/16/2017,6/26/2017,8/9/2017,,Article P,not used,not used,not used,no,not used
89898989,,898989,UL1 JJJJJJ,"AQUA, E",11/7/2017,6/16/2017,6/26/2017,8/9/2017,,Article P,not used,not used,not used,no,not used
90909090,,909090,P30 KKKKKK,"CLAY, F",1/1/2018,2/1/2018,2/1/2018,,,Article Q,not used,not used,not used,no,not used
98989898,,989898,U01 LLLLLL,"MINT, A",7/24/2017,1/26/2018,1/26/2018,,,Article R,not used,not used,not used,no,not used
87878787,,878787,T32 MMMMMM,"GRAY, I",8/1/2017,8/13/2017,,,,Article S,not used,not used,not used,no,not used
87878787,,878787,T32 NNNNNN,"IVORY, H",8/1/2017,8/13/2017,,,,Article S,not used,not used,not used,no,not used
76767676,,767676,R01 OOOOOO,"MAROON, P",12/1/2017,12/20/2017,12/20/2017,1/9/2018,,Article T,not used,not used,not used,no,not used
65656565,,656565,R01 PPPPPP,"SAGE, X",9/22/2017,9/18/2017,,10/2/2017,,Article U,not used,not used,not used,no,not used
65656565,,656565,R01 QQQQQQ,"SAGE, X",9/22/2017,9/18/2017,,10/2/2017,,Article U,not used,not used,not used,no,not used
54545454,,545454,T32 RRRRRR,"SAGE, X",9/22/2017,9/18/2017,,10/2/2017,,Article U,not used,not used,not used,no,not used
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
PMID,PMCID,NIHMSID,Grant Number,PI Name,Publication Date,NIHMS file deposited,NIHMS initial approval,NIHMS tagging complete,NIHMS final approval,Article Title,First Author Name,Journal Title,Journal Publisher,Method A Journal,NIHMS Person
13131313,,131313,T32 KKKKKK,"CHOCOLATE, A",7/1/2017,2/23/2016,3/9/2016,3/21/2016,,Article PQ,not used,not used,not used,no,not used
24242424,,,T32 LLLLLL,"CRIMSON, B",6/1/2017,,,,,Article OP,not used,not used,not used,no,not used
35353535,,,K23 MMMMMM,"NAVY, E",10/1/2017,,,,,Article NO,not used,not used,not used,no,not used
46464646,,,T32 NNNNNN,"IVORY, H",2/1/2018,,,,,Article MN,not used,not used,not used,no,not used
57575757,,,TL1 OOOOOO,"MINT, D",12/1/2017,,,,,Article LM,not used,not used,not used,no,not used
68686868,,686868,T32 PPPPPP,"RED, F",3/1/2017,8/15/2017,,,,Article KL,not used,not used,not used,no,not used
79797979,,,R01 QQQQQQ,"SAGE, X",4/1/2017,,,,,Article JK,not used,not used,not used,no,not used
79797979,,,R01 RRRRRR,"TEAL, K",3/1/2017,,,,,Article V,not used,not used,not used,no,not used
79797979,,,K24 SSSSSS,"TEAL, K",3/1/2017,,,,,Article V,not used,not used,not used,no,not used
80808080,,,N01 TTTTTT,"SLATE, M",4/1/2017,,,,,Article EF,not used,not used,not used,no,not used
97979797,,,P50 UUUUUU,"SANDY, F",4/15/2017,,,,,Article FG,not used,not used,not used,no,not used
86868686,,,P30 VVVVVV,"TAN, R",3/1/2017,,,,,Article GH,not used,not used,not used,no,not used
75757575,,,R01 WWWWWW,"GOLDEN, Q",2/1/2017,,,,,Article HI,not used,not used,not used,no,not used
86868686,,,P30 VVVVVV,"TAN, R",3/1/2017,,,,,Article GH,not used,not used,not used,no,not used
64646464,,646464,T32 XXXXXX,"LIME, N",5/1/2017,9/15/2017,,,,Article IJ,not used,not used,not used,no,not used
53535353,,,R01 YYYYYY,"CYAN, S",12/1/2017,,,,,Article AB,not used,not used,not used,no,not used
42424242,,424242,P30 ZZZZZZ,"OLIVE, K",5/1/2017,1/30/2017,2/9/2017,3/7/2017,,Article CD,not used,not used,not used,no,not used
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -327,9 +327,7 @@ public List<Submission> findSubmissionsByPublicationAndUserId(String pubId, Stri
subSelector.setFilter(subFilter);
subSelector.setInclude("publication", "repositories", "submitter", "grants");
PassClientResult<Submission> subResult = passClient.selectObjects(subSelector);
List<Submission> submissions = subResult.getObjects();

return submissions;
return subResult.getObjects();
}

/**
Expand Down Expand Up @@ -625,7 +623,7 @@ public boolean updateDeposit(Deposit deposit) throws IOException {
}

/**
 * Builds the combined key for a user/publication pair by concatenating the two identifiers.
 *
 * @param userId id of the user; must not be {@code null}
 * @param pubId  id of the publication
 * @return {@code userId} immediately followed by {@code pubId}, with no separator
 * @throws NullPointerException if {@code userId} is {@code null}
 */
private static String userIdPubIdKey(String userId, String pubId) {
    // toString() is kept on purpose: a null userId fails fast here instead of
    // silently producing a key that starts with the text "null".
    return userId.toString() + pubId;
}

}
Loading