From eedeeffa399b23a6e3f4061d126bc709f900fbd0 Mon Sep 17 00:00:00 2001 From: Arun Sittampalam Date: Fri, 10 Jun 2016 15:14:40 +0200 Subject: [PATCH 1/5] Working on rake task for synonym list reduction (#38). --- Gemfile.lock | 3 --- lib/tasks/db.rake | 29 ++++++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 4af4313..2196ec8 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -211,6 +211,3 @@ DEPENDENCIES spring uglifier (>= 1.3.0) web-console (~> 2.0) - -BUNDLED WITH - 1.11.2 diff --git a/lib/tasks/db.rake b/lib/tasks/db.rake index 8817ca1..b7cad20 100644 --- a/lib/tasks/db.rake +++ b/lib/tasks/db.rake @@ -268,4 +268,31 @@ namespace :db do Rake::Task['db:seed_numcase_data'].reenable Rake::Task['db:seed_numcase_data'].invoke(File.join(args.directory, '2013'), '2013') end -end \ No newline at end of file + + desc 'Deletes unnecessary synonyms from synonyms file.' + task :reduce_synonyms, [:directory] => :environment do |t, args| + file_name = File.join(args.directory, 'mesh_2016/synonyms.csv') + csv_contents = CSV.read(file_name, col_sep: ';') + count = `wc -l "#{file_name}"`.to_i + 1 + puts count + pg = ProgressBar.create(total: count, title: "Deleting unnecessary synonyms...") + csv_contents.each do |row| + row.each do |item| + pg.increment + [Adrg, Drg, Mdc].each do |model| + puts "." + ['de', 'fr', 'it'].each do |locale| + result = model.search item, + fields: ['code^5', {'text_' + locale.to_s + '^2' => :word_middle}, 'relevant_codes_' + locale.to_s], + limit: @limit, highlight: {tag: ''}, + misspellings: false, execute: false + if result.length == 0 + # TODO + end + end + end + end + end + pg.finish + end +end From 9e5cb71351149c7ac3cdce6439ac8eaa5050e762 Mon Sep 17 00:00:00 2001 From: Arun Sittampalam Date: Fri, 10 Jun 2016 15:40:44 +0200 Subject: [PATCH 2/5] Reduced synonyms are outputted now. --- lib/tasks/db.rake | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lib/tasks/db.rake b/lib/tasks/db.rake index b7cad20..0afaad0 100644 --- a/lib/tasks/db.rake +++ b/lib/tasks/db.rake @@ -274,25 +274,27 @@ namespace :db do file_name = File.join(args.directory, 'mesh_2016/synonyms.csv') csv_contents = CSV.read(file_name, col_sep: ';') count = `wc -l "#{file_name}"`.to_i + 1 - puts count pg = ProgressBar.create(total: count, title: "Deleting unnecessary synonyms...") csv_contents.each do |row| - row.each do |item| + row.delete_if do |item| pg.increment [Adrg, Drg, Mdc].each do |model| - puts "." - ['de', 'fr', 'it'].each do |locale| + ['de'].each do |locale| # at the moment ['de'] is enough because the synonyms list only contains german synonyms result = model.search item, fields: ['code^5', {'text_' + locale.to_s + '^2' => :word_middle}, 'relevant_codes_' + locale.to_s], limit: @limit, highlight: {tag: ''}, misspellings: false, execute: false - if result.length == 0 - # TODO - end + result.length == 0 end end end end + + CSV.open(File.join(args.directory, 'mesh_2016/synonyms_reduced.csv',), "w", col_sep: ';') do |csv| + csv_contents.each do |row| + csv << row + end + end pg.finish end end From e3f7307f690c6dcd1eb60793b5b0f5cac2886fc1 Mon Sep 17 00:00:00 2001 From: Arun Sittampalam Date: Mon, 13 Jun 2016 13:31:57 +0200 Subject: [PATCH 3/5] Checking if any entry of a row can be found. If no whole row gets deleted. --- app/models/adrg.rb | 2 +- lib/tasks/db.rake | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/app/models/adrg.rb b/app/models/adrg.rb index c847c34..fffd685 100644 --- a/app/models/adrg.rb +++ b/app/models/adrg.rb @@ -8,7 +8,7 @@ class Adrg < ActiveRecord::Base include MultiLanguageText searchkick word_middle: [:text_de, :text_fr, :text_it], - callbacks: false, language: 'german', batch_size: 50, + callbacks: false, language: 'german', batch_size: 10, synonyms: -> { CSV.read('data/mesh_2016/synonyms.csv', {col_sep: ';'}) } def code_display diff --git a/lib/tasks/db.rake b/lib/tasks/db.rake index 0afaad0..8a02e79 100644 --- a/lib/tasks/db.rake +++ b/lib/tasks/db.rake @@ -271,23 +271,28 @@ namespace :db do desc 'Deletes unnecessary synonyms from synonyms file.' task :reduce_synonyms, [:directory] => :environment do |t, args| + Searchkick.timeout = 20 + file_name = File.join(args.directory, 'mesh_2016/synonyms.csv') csv_contents = CSV.read(file_name, col_sep: ';') - count = `wc -l "#{file_name}"`.to_i + 1 + count = csv_contents.length pg = ProgressBar.create(total: count, title: "Deleting unnecessary synonyms...") - csv_contents.each do |row| - row.delete_if do |item| - pg.increment + csv_contents.delete_if do |row| + pg.increment + found = false + puts "Analysing #{row}" + row.each do |item| [Adrg, Drg, Mdc].each do |model| ['de'].each do |locale| # at the moment ['de'] is enough because the synonyms list only contains german synonyms result = model.search item, fields: ['code^5', {'text_' + locale.to_s + '^2' => :word_middle}, 'relevant_codes_' + locale.to_s], limit: @limit, highlight: {tag: ''}, misspellings: false, execute: false - result.length == 0 + found = found or result.length == 0 end end end + found end CSV.open(File.join(args.directory, 'mesh_2016/synonyms_reduced.csv',), "w", col_sep: ';') do |csv| From e65903f0a1d0aa2b582ce750fee7910c8cc78584 Mon Sep 17 00:00:00 2001 From: Arun Sittampalam Date: Mon, 13 Jun 2016 15:33:33 +0200 Subject: [PATCH 4/5] Fixed bugs in search query and saving lines to synonyms_reduced.csv while iterating through synonyms.csv instead of at the end. --- lib/tasks/db.rake | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/lib/tasks/db.rake b/lib/tasks/db.rake index 8a02e79..b8a05da 100644 --- a/lib/tasks/db.rake +++ b/lib/tasks/db.rake @@ -277,28 +277,31 @@ namespace :db do csv_contents = CSV.read(file_name, col_sep: ';') count = csv_contents.length pg = ProgressBar.create(total: count, title: "Deleting unnecessary synonyms...") - csv_contents.delete_if do |row| - pg.increment - found = false - puts "Analysing #{row}" - row.each do |item| - [Adrg, Drg, Mdc].each do |model| - ['de'].each do |locale| # at the moment ['de'] is enough because the synonyms list only contains german synonyms - result = model.search item, - fields: ['code^5', {'text_' + locale.to_s + '^2' => :word_middle}, 'relevant_codes_' + locale.to_s], - limit: @limit, highlight: {tag: ''}, - misspellings: false, execute: false - found = found or result.length == 0 - end - end - end - found - end CSV.open(File.join(args.directory, 'mesh_2016/synonyms_reduced.csv',), "w", col_sep: ';') do |csv| - csv_contents.each do |row| - csv << row - end + csv_contents.each do |row| + pg.increment + found = false + puts "Analysing #{row}" + row.each do |item| + [Adrg, Drg, Mdc].each do |model| + ['de'].each do |locale| # at the moment ['de'] is enough because the synonyms list only contains german synonyms + result = model.search item, + fields: ['code^5', {'text_' + locale.to_s + '^2' => :word_middle}, 'relevant_codes_' + locale.to_s], + limit: 1, highlight: {tag: ''}, + misspellings: {edit_distance: 1}, execute: false + #puts "results length #{result.length} ... #{result}" + found = (found or result.length > 0) + end + end + end + if found + puts "Added #{row}" + csv << row + else + puts "Deleted #{row}" + end + end end pg.finish end From bf10d028a894673b38b567ebe0dfa56044575150 Mon Sep 17 00:00:00 2001 From: Arun Sittampalam Date: Mon, 13 Jun 2016 16:19:39 +0200 Subject: [PATCH 5/5] Updated data and removed debug outputs from rake task. --- data | 2 +- lib/tasks/db.rake | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/data b/data index 0915eab..086ee09 160000 --- a/data +++ b/data @@ -1 +1 @@ -Subproject commit 0915eabf7759613bd15e28b72eb3ceadff971bb2 +Subproject commit 086ee09a0dc6fc9f0c7ced5b5ea636e90d08755c diff --git a/lib/tasks/db.rake b/lib/tasks/db.rake index b8a05da..732fab5 100644 --- a/lib/tasks/db.rake +++ b/lib/tasks/db.rake @@ -269,20 +269,19 @@ namespace :db do Rake::Task['db:seed_numcase_data'].invoke(File.join(args.directory, '2013'), '2013') end - desc 'Deletes unnecessary synonyms from synonyms file.' + desc 'Reads synonyms.csv and writes relevant synonyms to synonyms_reduced.csv.' task :reduce_synonyms, [:directory] => :environment do |t, args| Searchkick.timeout = 20 file_name = File.join(args.directory, 'mesh_2016/synonyms.csv') csv_contents = CSV.read(file_name, col_sep: ';') count = csv_contents.length - pg = ProgressBar.create(total: count, title: "Deleting unnecessary synonyms...") + pg = ProgressBar.create(total: count, title: "Outputting relevant synonyms...") CSV.open(File.join(args.directory, 'mesh_2016/synonyms_reduced.csv',), "w", col_sep: ';') do |csv| csv_contents.each do |row| pg.increment found = false - puts "Analysing #{row}" row.each do |item| [Adrg, Drg, Mdc].each do |model| ['de'].each do |locale| # at the moment ['de'] is enough because the synonyms list only contains german synonyms @@ -290,16 +289,13 @@ namespace :db do fields: ['code^5', {'text_' + locale.to_s + '^2' => :word_middle}, 'relevant_codes_' + locale.to_s], limit: 1, highlight: {tag: ''}, misspellings: {edit_distance: 1}, execute: false - #puts "results length #{result.length} ... #{result}" found = (found or result.length > 0) end end end if found - puts "Added #{row}" csv << row else - puts "Deleted #{row}" end end end