From cb4cbbffba1c5d4a580971ad909d8f63d49bd9f8 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Wed, 9 Feb 2022 14:23:01 +0100
Subject: [PATCH] add OpenCitationsCitedCountDatabase

---
 python/labe/tasks.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/python/labe/tasks.py b/python/labe/tasks.py
index 013b145..0ffe0f3 100644
--- a/python/labe/tasks.py
+++ b/python/labe/tasks.py
@@ -247,6 +247,33 @@ def on_success(self):
         self.create_symlink(name="current")
 
 
+class OpenCitationsCitedCountDatabase(Task):
+    """
+    Generate a database mapping DOI to inbound link count. Could be used to
+    augment metadata with citation count, which in turn could be used to sort
+    by this data.
+    """
+
+    def requires(self):
+        return OpenCitationsCitedCountTable()
+
+    def run(self):
+        output = shellout(r"""
+                          zstdcat -T0 {input} |
+                          makta -T INTEGER -init -o {output}
+                          """,
+                          input=self.input().path)
+        luigi.LocalTarget(output).move(self.output().path)
+
+    def output(self):
+        fingerprint = self.open_citations_url_hash()
+        filename = "{}.db".format(fingerprint)
+        return luigi.LocalTarget(path=self.path(filename=filename), format=Zstd)
+
+    def on_success(self):
+        self.create_symlink(name="current")
+
+
 class SolrFetchDocs(Task):
     """
     Fetch JSON data from SOLR; uses solrdump (https://github.com/ubleipzig/solrdump).
@@ -411,5 +438,7 @@ def requires(self):
         yield SolrDatabase(date=self.date, name="slub-production", short=False)
         yield IdMappingDatabase(date=self.date)
         yield OpenCitationsDatabase()
+        # This is generated, but not used yet.
+        yield OpenCitationsCitedCountDatabase()
         # We want OpenCitationsRanked for cache warmup.
         yield OpenCitationsRanked()