diff --git a/app/jobs/company_register_status_job.rb b/app/jobs/company_register_status_job.rb index 9266b0e088..601c9b1491 100644 --- a/app/jobs/company_register_status_job.rb +++ b/app/jobs/company_register_status_job.rb @@ -3,28 +3,10 @@ class CompanyRegisterStatusJob < ApplicationJob queue_as :default - FILENAME = 'ettevotja_rekvisiidid__lihtandmed.csv.zip' - UNZIP_FILENAME = 'ettevotja_rekvisiidid__lihtandmed.csv' - DESTINATION = 'lib/tasks/data/' - - def perform(days_interval = 14, spam_time_delay = 0.2, batch_size = 100, download_open_data_file_url='https://avaandmed.ariregister.rik.ee/sites/default/files/avaandmed/ettevotja_rekvisiidid__lihtandmed.csv.zip') - - download_open_data_file(download_open_data_file_url, DESTINATION + FILENAME) - unzip_file(FILENAME, DESTINATION) - - codes_in_csv = collect_company_codes(DESTINATION + UNZIP_FILENAME) - + def perform(days_interval = 14, spam_time_delay = 1, batch_size = 100) sampling_registrant_contact(days_interval).find_in_batches(batch_size: batch_size) do |contacts| - contacts.each do |contact| - if codes_in_csv.include?(contact.ident) - proceed_company_status(contact, spam_time_delay) - else - schedule_force_delete(contact) - end - end + contacts.each { |contact| proceed_company_status(contact, spam_time_delay) } end - - remove_temp_file(DESTINATION + UNZIP_FILENAME) end private @@ -36,65 +18,39 @@ def proceed_company_status(contact, spam_time_delay) company_status = contact.return_company_status contact.update!(company_register_status: company_status, checked_company_at: Time.zone.now) - puts company_status case company_status - when Contact::REGISTERED - lift_force_delete(contact) if check_for_force_delete(contact) - when Contact::LIQUIDATED - ContactInformMailer.company_liquidation(contact: contact).deliver_now - when Contact::BANKRUPT || Contact::DELETED - schedule_force_delete(contact) - end - end - - def collect_company_codes(open_data_file_path) - codes_in_csv = [] - CSV.foreach(open_data_file_path, headers: true, col_sep: ';', quote_char: '"', liberal_parsing: true) do |row| - codes_in_csv << row['ariregistri_kood'] - end - - codes_in_csv - end - - def download_open_data_file(url, filename) - uri = URI(url) - - Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http| - request = Net::HTTP::Get.new(uri) - response = http.request(request) - - if response.code == '200' - File.open(filename, 'wb') do |file| - file.write(response.body) - end - else - puts "Failed to download file: #{response.code} #{response.message}" - end + when Contact::REGISTERED + lift_force_delete(contact) if check_for_force_delete(contact) + when Contact::LIQUIDATED + ContactInformMailer.company_liquidation(contact: contact).deliver_now + else + # Here is case when company is not found in the register or it is deleted (Contact::DELETED status) or bankrupt (Contact::BANKRUPT status) + schedule_force_delete(contact) end - puts "File saved as #{filename}" + status = company_status.blank? ? Contact::DELETED : company_status + puts status + update_validation_company_status(contact:contact , status: status) end - def unzip_file(filename, destination) - ::Zip::File.open(destination + filename) do |zip_file| - zip_file.each do |entry| - entry.extract(File.join(destination, entry.name)) { true } - end - end + def sampling_registrant_contact(days_interval) + Registrant.where(ident_type: 'org', ident_country_code: 'EE').where( + "(company_register_status IS NULL OR checked_company_at IS NULL) OR + (company_register_status = ? AND checked_company_at < ?) OR + company_register_status IN (?)", + Contact::REGISTERED, days_interval.days.ago, [Contact::LIQUIDATED, Contact::BANKRUPT, Contact::DELETED] + ) - puts "Archive invoke to #{destination}" end - def sampling_registrant_contact(days_interval) - Registrant.where(ident_type: 'org', ident_country_code: 'EE') - .where('(company_register_status IS NULL) OR - (company_register_status = ? AND (checked_company_at IS NULL OR checked_company_at <= ?)) OR - (company_register_status = ? AND (checked_company_at IS NULL OR checked_company_at <= ?))', - Contact::REGISTERED, days_interval.days.ago, Contact::LIQUIDATED, 1.day.ago) + def update_validation_company_status(contact:, status:) + contact.update(company_register_status: status, checked_company_at: Time.zone.now) end def schedule_force_delete(contact) contact.domains.each do |domain| + next if domain.schedule_force_delete? + domain.schedule_force_delete( type: :fast_track, notify_by_email: true, @@ -115,8 +71,4 @@ def lift_force_delete(contact) domain.lift_force_delete end end - - def remove_temp_file(distination) - FileUtils.rm(distination) if File.exist?(distination) - end end diff --git a/app/models/contact/company_register.rb b/app/models/contact/company_register.rb index 6564880ee9..1c436eff75 100644 --- a/app/models/contact/company_register.rb +++ b/app/models/contact/company_register.rb @@ -1,17 +1,10 @@ module Contact::CompanyRegister extend ActiveSupport::Concern - COMPANY_STATUSES = { - 'r' => 'registered', - 'l' => 'liquidated', - 'n' => 'bankrupt', - 'k' => 'deleted', - }.freeze - - REGISTERED = 'registered'.freeze - LIQUIDATED = 'liquidated'.freeze - BANKRUPT = 'bankrupt'.freeze - DELETED = 'deleted'.freeze + REGISTERED = 'R'.freeze + LIQUIDATED = 'L'.freeze + BANKRUPT = 'N'.freeze + DELETED = 'K'.freeze def company_is_relevant? company_register_status == REGISTERED && company_register_status == LIQUIDATED @@ -20,8 +13,7 @@ def company_is_relevant? def return_company_status return if return_company_data.blank? - status = return_company_data.first[:status].downcase - COMPANY_STATUSES[status] + return_company_data.first[:status] end def return_company_data diff --git a/lib/tasks/check_for_company_status.rake b/lib/tasks/check_for_company_status.rake deleted file mode 100644 index 6aec669324..0000000000 --- a/lib/tasks/check_for_company_status.rake +++ /dev/null @@ -1,37 +0,0 @@ -require 'optparse' -require 'rake_option_parser_boilerplate' -require 'syslog/logger' -require 'active_record' - -DAYS_INTERVAL = 365 -SPAM_TIME_DELAY = 0.3 -BATCH_SIZE = 100 - -namespace :company_status do - # bundle exec rake company_status:check_all -- --days_interval=128 --spam_time_delay=0.3 --batch_size=100 --force_delete=false - - desc 'Starts verifying registrant companies job with optional days interval, spam time delay and batch size' - task check_all: :environment do - options = { - days_interval: DAYS_INTERVAL, - spam_time_delay: SPAM_TIME_DELAY, - batch_size: BATCH_SIZE, - force_delete: "false" - } - - opts_hash = { - days_interval: ["--days_interval=VALUE", Integer], - spam_time_delay: ["--spam_time_delay=VALUE", Float], - batch_size: ["--batch_size=VALUE", Integer], - force_delete: ["--force_delete=VALUE", String] - } - - banner = 'Usage: rake company_status:check_all -- [options]' - options = RakeOptionParserBoilerplate.process_args(options: options, - banner: banner, - hash: opts_hash) - - options[:force_delete] = options[:force_delete].downcase == 'true' - CompanyRegisterStatusJob.perform_later(options[:days_interval], options[:spam_time_delay], options[:batch_size], options[:force_delete]) - end -end diff --git a/lib/tasks/company_status.rake b/lib/tasks/company_status.rake index 79b2615f0e..31d6dd53aa 100644 --- a/lib/tasks/company_status.rake +++ b/lib/tasks/company_status.rake @@ -8,39 +8,45 @@ require 'rake_option_parser_boilerplate' namespace :company_status do - # bundle exec rake company_status:check_for_exists -- --open_data_file_path=lib/tasks/data/ettevotja_rekvisiidid__lihtandmed.csv --missing_companies_output_path=lib/tasks/data/missing_companies_in_business_registry.csv --deleted_companies_output_path=lib/tasks/data/deleted_companies_from_business_registry.csv --download_path=https://avaandmed.ariregister.rik.ee/sites/default/files/avaandmed/ettevotja_rekvisiidid__lihtandmed.csv.zip + # bundle exec rake company_status:check_all -- --open_data_file_path=lib/tasks/data/ettevotja_rekvisiidid__lihtandmed.csv --missing_companies_output_path=lib/tasks/data/missing_companies_in_business_registry.csv --deleted_companies_output_path=lib/tasks/data/deleted_companies_from_business_registry.csv --download_path=https://avaandmed.ariregister.rik.ee/sites/default/files/avaandmed/ettevotja_rekvisiidid__lihtandmed.csv.zip desc 'Get Estonian companies status from Business Registry.' DELETED_FROM_REGISTRY_STATUS = 'K' - FILENAME = 'opendata_business_registry.csv.zip' DESTINATION = 'lib/tasks/data/' + COMPANY_STATUS = 'ettevotja_staatus' + BUSINESS_REGISTRY_CODE = 'ariregistri_kood' - task :check_for_exists => :environment do + task :check_all => :environment do options = initialize_rake_task open_data_file_path = options[:open_data_file_path] missing_companies_in_business_registry_path = options[:missing_companies_output_path] deleted_companies_from_business_registry_path = options[:deleted_companies_output_path] download_path = options[:download_path] - output_file_path = 'lib/tasks/data/temp_missing_companies_output.csv' + downloaded_filename = File.basename(URI(download_path).path) puts "*** Run 1 step. Downloading fresh open data file. ***" - - download_open_data_file(download_path, FILENAME) - unzip_file(FILENAME, DESTINATION) - - # Remove old file - remove_old_file(output_file_path) - - puts "*** Run 2 step. Collecting companies what are not in the open data file. ***" - collect_companies_whats_not_in_open_data_file(open_data_file_path, output_file_path) - - puts "*** Run 3 step. Fetching detailed information from business registry. ***" - sort_missing_companies_to_different_files(output_file_path, missing_companies_in_business_registry_path, deleted_companies_from_business_registry_path) - - puts '*** Run 4 step. Remove temporary files. ***' - remove_old_file(output_file_path) - FileUtils.rm(FILENAME) if File.exist?(FILENAME) + remove_old_file(DESTINATION + downloaded_filename) + download_open_data_file(download_path, downloaded_filename) + unzip_file(downloaded_filename, DESTINATION) + + puts "*** Run 2 step. I am collecting data from open business registry sources. ***" + company_data = collect_company_data(open_data_file_path) + + puts "*** Run 3 step. I process companies, update their information, and sort them into different files based on whether the companies are missing or removed from the business registry ***" + Registrant.where(ident_type: 'org', ident_country_code: 'EE').find_each do |contact| + if company_data.key?(contact.ident) + update_company_status(contact: contact, status: company_data[contact.ident][COMPANY_STATUS]) + puts "Company: #{contact.name} with ident: #{contact.ident} and ID: #{contact.id} has status: #{company_data[contact.ident][COMPANY_STATUS]}" + else + update_company_status(contact: contact, status: 'K') + sort_companies_to_files( + contact: contact, + missing_companies_in_business_registry_path: missing_companies_in_business_registry_path, + deleted_companies_from_business_registry_path: deleted_companies_from_business_registry_path, + ) + end + end puts '*** Done ***' end @@ -90,6 +96,15 @@ namespace :company_status do puts "Archive invoke to #{destination}" end + def collect_company_data(open_data_file_path) + company_data = {} + + CSV.foreach(open_data_file_path, headers: true, col_sep: ';', quote_char: '"', liberal_parsing: true) do |row| + company_data[row[BUSINESS_REGISTRY_CODE]] = row + end + + company_data + end def download_open_data_file(url, filename) uri = URI(url) @@ -110,86 +125,43 @@ namespace :company_status do puts "File saved as #{filename}" end - def collect_companies_whats_not_in_open_data_file(open_data_file_path, output_file_path) - codes_in_csv = collect_company_codes(open_data_file_path) - put_missing_companies_to_file(output_file_path, codes_in_csv) - end - - def collect_company_codes(open_data_file_path) - codes_in_csv = [] - CSV.foreach(open_data_file_path, headers: true, col_sep: ';', quote_char: '"', liberal_parsing: true) do |row| - codes_in_csv << row['ariregistri_kood'] - end - - codes_in_csv + def update_company_status(contact:, status:) + contact.update(company_register_status: status, checked_company_at: Time.zone.now) end - def put_missing_companies_to_file(output_file_path, codes_in_csv) - CSV.open(output_file_path, 'wb', write_headers: true, headers: ["ID", "Code", "Name"]) do |csv| - Contact.where(ident_type: 'org', ident_country_code: 'EE').find_each do |contact| - # [16526891, 14836742, 12489420, 12226399, 12475122].each do |test_ident| - # Contact.where(ident: test_ident).limit(100).each do |contact| - unless codes_in_csv.include?(contact.ident) - csv << [contact.id, contact.ident, contact.name] - end - # end - end - end + def put_company_to_missing_file(contact:, path:) + write_to_csv_file(csv_file_path: path, headers: ["ID", "Ident", "Name"], attrs: [contact.id, contact.ident, contact.name]) end - def sort_missing_companies_to_different_files(output_file_path, missing_companies_in_business_registry_path, deleted_companies_from_business_registry_path) - contact_no_in_business_registry = [] - contact_which_were_deleted = [] - - collect_missing_companies_ids(output_file_path).each do |id| - puts "Fetching data for ID: #{id}" - - contact = Contact.find(id.to_i) - - resp = contact.return_company_details - - if resp.empty? - contact_no_in_business_registry << [contact.id, contact.ident, contact.name] - else - status = resp.first.status.upcase - kandeliik_type = resp.first.kandeliik.last.last.kandeliik - kandeliik_tekstina = resp.first.kandeliik.last.last.kandeliik_tekstina - kande_kpv = resp.first.kandeliik.last.last.kande_kpv - - if status == DELETED_FROM_REGISTRY_STATUS - contact_which_were_deleted << [contact.id, contact.ident, contact.name, status, kandeliik_type, kandeliik_tekstina, kande_kpv] - end + def sort_companies_to_files(contact:, missing_companies_in_business_registry_path:, deleted_companies_from_business_registry_path:) + sleep 1 + resp = contact.return_company_details + + if resp.empty? + put_company_to_missing_file(contact: contact, path: missing_companies_in_business_registry_path) + puts "Company: #{contact.name} with ident: #{contact.ident} and ID: #{contact.id} is missing in registry, company id: #{contact.id}" + else + status = resp.first.status.upcase + kandeliik_type = resp.first.kandeliik.last.last.kandeliik + kandeliik_tekstina = resp.first.kandeliik.last.last.kandeliik_tekstina + kande_kpv = resp.first.kandeliik.last.last.kande_kpv + + if status == DELETED_FROM_REGISTRY_STATUS + csv_file_path = deleted_companies_from_business_registry_path + headers = ["ID", "Ident", "Name", "Status", "Kandeliik Type", "Kandeliik Tekstina", "kande_kpv"] + attrs = [contact.id, contact.ident, contact.name, status, kandeliik_type, kandeliik_tekstina, kande_kpv] + write_to_csv_file(csv_file_path: csv_file_path, headers: headers, attrs: attrs) + + puts "Company: #{contact.name} with ident: #{contact.ident} and ID: #{contact.id} has status #{status}, company id: #{contact.id}" end - - sleep 1 end - - save_missing_companies(contact_no_in_business_registry, missing_companies_in_business_registry_path) - save_deleted_companies(contact_which_were_deleted, deleted_companies_from_business_registry_path) end - def collect_missing_companies_ids(output_file_path) - ids = [] - CSV.foreach(output_file_path, headers: true, quote_char: '"', liberal_parsing: true) do |row| - ids << row['ID'] - end + def write_to_csv_file(csv_file_path:, headers:, attrs:) + write_headers = !File.exist?(csv_file_path) - ids - end - - def save_missing_companies(contact_no_in_business_registry, missing_companies_in_business_registry_path) - CSV.open(missing_companies_in_business_registry_path, 'wb', write_headers: true, headers: ["ID", "Code", "Name"]) do |csv| - contact_no_in_business_registry.each do |entry| - csv << entry - end - end - end - - def save_deleted_companies(contact_which_were_deleted, deleted_companies_from_business_registry_path) - CSV.open(deleted_companies_from_business_registry_path, 'wb', write_headers: true, headers: ["ID", "Ident", "Name", "Status", "Kandeliik Type", "Kandeliik Tekstina", "kande_kpv"]) do |csv| - contact_which_were_deleted.each do |entry| - csv << entry - end + CSV.open(csv_file_path, "ab", write_headers: write_headers, headers: headers) do |csv| + csv << attrs end end end