NLeSC · stijnh · Jul 31, 2024 · Jul 31, 2024 · Jul 31, 2024
diff --git a/litstudy/sources/csv.py b/litstudy/sources/csv.py
@@ -116,7 +116,7 @@ def publication_date(self):
 
  for fmt in formats:
  try:
- return datetime.strptime(text, fmt)
+ return datetime.datetime.strptime(text, fmt)
  except Exception:
  pass
 
@@ -167,6 +167,7 @@ def load_csv(
  citation_field: str = None,
  date_field: str = None,
  source_field: str = None,
+ doi_field: str = None,
  filter=None,
 ) -> DocumentSet:
  """Load an abitrary CSV file and parse its contents as a ``DocumentSet``
@@ -190,6 +191,8 @@ def load_csv(
  :param abstract_field: Field name for ``abstract``.
  :param citation_field: Field name for ``citation_count``.
  :param date_field: Field name for ``publication_date`` or
+ :param source_field: Field name for ``source``.
+ :param doi_field: Field name for ``doi``.
  :param filter: Optional function applied to each loaded record. This
  function can be used to, for example, add or delete fields.
 
@@ -309,7 +312,8 @@ def load_csv(
  "pubmed id",
  ],
  ),
- doi=find_field(
+ doi=doi_field
+ or find_field(
  columns,
  [
  "doi",

diff --git a/tests/resources/retraction_watch.csv b/tests/resources/retraction_watch.csv
@@ -0,0 +1,2 @@
+Record ID,Title,Subject,Institution,Journal,Publisher,Country,Author,URLS,ArticleType,RetractionDate,RetractionDOI,RetractionPubMedID,OriginalPaperDate,OriginalPaperDOI,OriginalPaperPubMedID,RetractionNature,Reason,Paywalled,Notes
+4242,Reflections on Research Software,(B/T) Computer Science;(B/T) Data Science;(B/T) Technology;,"Netherlands fScience Center, Nieuw-Amsterdam, Netherlands",Journal of Prominent Things,Prominence Inc,Netherlands,Patrick Bos,,Fake Research Article;,7/31/2024 14:00,10.4242/2024/01,0,7/31/2024 13:59,10.4242/2024/00,0,Retraction,+Concerns/Issues About Reality;+Randomly Generated Content;,No,This is a made-up dummy entry.
diff --git a/tests/test_sources_csv.py b/tests/test_sources_csv.py
@@ -48,3 +48,27 @@ def test_load_scopus_csv():
 
  assert len(doc.authors) == 10
  assert doc.authors[0].name == "Phillips J.C."
+
+
+def test_load_retraction_watch_csv():
+    """Load a Retraction Watch style CSV using explicit ``doi_field``/``source_field`` names."""
+    path = os.path.join(os.path.dirname(__file__), "resources", "retraction_watch.csv")
+
+    def date_filter(d: dict) -> dict:  # copies OriginalPaperDate into "date" as an ISO date
+        import datetime
+        try:
+            d["date"] = datetime.datetime.strptime(d["OriginalPaperDate"], "%m/%d/%Y %H:%M").date().isoformat()
+        except ValueError:
+            pass  # timestamp not in the expected format: leave the record untouched
+        return d
+
+    docs = load_csv(path, doi_field="OriginalPaperDOI", source_field="Journal", filter=date_filter)
+    doc = docs[0]
+
+    assert doc.title == "Reflections on Research Software"
+    assert doc.publication_source == "Journal of Prominent Things"
+    assert doc.language is None
+    assert doc.publication_year == 2024
+
+    assert len(doc.authors) == 1
+    assert doc.authors[0].name == "Patrick Bos"