From 2c273d84a1141628ea1d48ba69db05396a34cbb1 Mon Sep 17 00:00:00 2001 From: Justin Coyne Date: Fri, 21 Apr 2023 12:02:05 -0500 Subject: [PATCH] Add a procfile for background jobs --- Procfile.prod | 5 +++++ Procfile.stage | 6 ++++++ README.md | 13 ++++++++++++- 3 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 Procfile.prod create mode 100644 Procfile.stage diff --git a/Procfile.prod b/Procfile.prod new file mode 100644 index 000000000..680d5163c --- /dev/null +++ b/Procfile.prod @@ -0,0 +1,5 @@ +marc_bodoni_prod_indexer: JRUBY_OPTS=-J-Xmx8192m /usr/local/rvm/bin/rvm jruby-9.4.1.0 do bundle exec traject -c ./lib/traject/config/sirsi_config.rb -s solr_writer.max_skipped=-1 -s log.level=debug -s log.file=log/traject_marc_bodoni_prod_indexer.log -s processing_thread_pool=18 -s kafka.topic=marc_bodoni -s kafka.consumer_group_id=traject_marc_bodoni_prod -s solr.url=http://sul-solr.stanford.edu/solr/searchworks-prod -s reserves_path=/data/sirsi/bodoni/crez +marc_morison_prod_indexer: JRUBY_OPTS=-J-Xmx8192m /usr/local/rvm/bin/rvm jruby-9.4.1.0 do bundle exec traject -c ./lib/traject/config/sirsi_config.rb -s solr_writer.max_skipped=-1 -s log.level=debug -s log.file=log/traject_marc_morison_prod_indexer.log -s processing_thread_pool=4 -s kafka.topic=marc_morison -s kafka.consumer_group_id=traject_marc_morison_prod -s solr.url=http://sul-solr.stanford.edu/solr/searchworks-morison +sdr_prod_indexer_catchup: /usr/local/rvm/bin/rvm ruby-3.1.2 do bundle exec traject -c ./lib/traject/config/sdr_config.rb -s solr_writer.max_skipped=-1 -s log.level=debug -s log.file=log/traject_sdr_prod_indexer_catchup.log -s kafka.topic=purl_fetcher_prod -s kafka.consumer_group_id=traject_catchup -s solr.url=http://sul-solr.stanford.edu/solr/searchworks-prod +sdr_preview_indexer: /usr/local/rvm/bin/rvm ruby-3.1.2 do bundle exec traject -c ./lib/traject/config/sdr_config.rb -s solr_writer.max_skipped=-1 -s log.level=debug -s log.file=log/traject_sdr_preview_indexer.log -s 
kafka.topic=purl_fetcher_prod -s kafka.consumer_group_id=traject_sdr_preview_prod -s purl_fetcher.target=SearchWorksPreview -s purl_fetcher.skip_catkey=false -s solr.url=http://sul-solr.stanford.edu/solr/sw-preview-prod +earthworks_prod_indexer: /usr/local/rvm/bin/rvm ruby-3.1.2 do bundle exec traject -c ./lib/traject/config/geo_config.rb -s solr_writer.max_skipped=-1 -s log.level=debug -s log.file=log/traject_earthworks-prod-indexer.log -s kafka.topic=purl_fetcher_prod -s kafka.consumer_group_id=earthworks-prod-indexer -s solr.url=http://sul-solr.stanford.edu/solr/earthworks-prod \ No newline at end of file diff --git a/Procfile.stage b/Procfile.stage new file mode 100644 index 000000000..f23704a5f --- /dev/null +++ b/Procfile.stage @@ -0,0 +1,6 @@ +marc_bodoni_dev_indexer: JRUBY_OPTS=-J-Xmx8192m /usr/local/rvm/bin/rvm jruby-9.4.1.0 do bundle exec traject -c ./lib/traject/config/sirsi_config.rb -s solr_writer.max_skipped=-1 -s log.level=debug -s log.file=log/traject_marc_bodoni_dev_indexer.log -s processing_thread_pool=2 -s kafka.topic=marc_bodoni -s kafka.consumer_group_id=traject_marc_bodoni_dev -s solr.url=http://sul-solr.stanford.edu/solr/searchworks-dev -s reserves_path=/data/sirsi/bodoni/crez +marc_morison_dev_indexer: JRUBY_OPTS=-J-Xmx8192m /usr/local/rvm/bin/rvm jruby-9.4.1.0 do bundle exec traject -c ./lib/traject/config/sirsi_config.rb -s solr_writer.max_skipped=-1 -s log.level=debug -s log.file=log/traject_marc_morison_dev_indexer.log -s processing_thread_pool=2 -s kafka.topic=marc_morison -s kafka.consumer_group_id=traject_marc_morison_dev -s solr.url=http://sul-solr.stanford.edu/solr/searchworks-morison-dev -s reserves_path=/data/sirsi/morison/crez +sw_dev_indexer: /usr/local/rvm/bin/rvm ruby-3.1.2 do bundle exec traject -c ./lib/traject/config/sdr_config.rb -s solr_writer.max_skipped=-1 -s log.level=debug -s log.file=log/traject_sw_dev_indexer.log -s kafka.topic=purl_fetcher_prod -s solr.url=http://sul-solr.stanford.edu/solr/searchworks-dev 
+sw_preview_stage_indexer: /usr/local/rvm/bin/rvm ruby-3.1.2 do bundle exec traject -c ./lib/traject/config/sdr_config.rb -s solr_writer.max_skipped=-1 -s log.level=debug -s log.file=log/traject_sw_preview_stage_indexer.log -s kafka.topic=purl_fetcher_stage -s kafka.consumer_group_id=traject_purl_fetcher_stage_sw_preview -s purl_fetcher.target=SearchWorksPreview -s purl_fetcher.skip_catkey=false -s purl.url=https://sul-purl-test.stanford.edu -s solr.url=http://sul-solr.stanford.edu/solr/sw-preview-stage +earthworks_stage_indexer: /usr/local/rvm/bin/rvm ruby-3.1.2 do bundle exec traject -c ./lib/traject/config/geo_config.rb -s solr_writer.max_skipped=-1 -s log.level=debug -s log.file=log/traject_earthworks-stage-indexer.log -s kafka.topic=purl_fetcher_stage -s kafka.consumer_group_id=earthworks-stage-indexer -s purl.url=https://sul-purl-stage.stanford.edu -s stacks.url=https://sul-stacks-stage.stanford.edu -s geoserver.pub_url=https://earthworks-geoserver-stage-b.stanford.edu/geoserver -s geoserver.stan_url=https://earthworks-geoserver-stage-a.stanford.edu/geoserver -s solr.url=http://sul-solr.stanford.edu/solr/earthworks-stage +folio_dev_indexer: /usr/local/rvm/bin/rvm ruby-3.1.2 do bundle exec traject -c ./lib/traject/config/folio_config.rb -s solr_writer.max_skipped=-1 -s log.level=debug -s log.file=log/traject_folio_dev_indexer.log -s kafka.topic=folio_test -s kafka.consumer_group_id=traject_folio_dev -s reader_class_name=Traject::KafkaFolioReader -s solr.url=http://sul-solr.stanford.edu/solr/searchworks-folio-dev \ No newline at end of file diff --git a/README.md b/README.md index 4bfbd4f7d..e5c0a5617 100644 --- a/README.md +++ b/README.md @@ -29,11 +29,22 @@ bundle exec rake ``` note that some integration tests may hit a live server, for which you may need to be on the Stanford VPN. 
+## Building services +For development we can use Foreman to run a Procfile, but on a deployed machine we export the process definitions as systemd units (the example assumes `~` is `/opt/app/indexer`): +``` +foreman export -a traject -f Procfile.stage --formation marc_bodoni_dev_indexer=1,marc_morison_dev_indexer=1,folio_dev_indexer=8,sw_dev_indexer=2,sw_preview_stage_indexer=2,earthworks_stage_indexer=1 systemd ~/service_templates +sudo cp /opt/app/indexer/service_templates/* /usr/lib/systemd/system/ + +sudo systemctl enable traject.target +sudo systemctl start traject.target +``` + ## indexing data indexing is a multi-step process: 1. an extractor process publishes data to be indexed to a [kafka](https://kafka.apache.org/) topic -2. a daemon run by [eye](https://github.com/kostya/eye) consumes data from the kafka topic and invokes traject +2. a service run by systemd consumes data from the kafka topic and invokes traject 3. traject uses a given configuration to index the data into a solr collection + ### publishing data to kafka extractor processes are written as ruby scripts in `script/` and usually invoked by shell scripts located in the same directory. they make use of traject extractor classes stored in `lib/traject/extractors/`, which use the ruby kafka client to publish data to a kafka topic using the pattern: ```ruby