From 7a38885b4ebaa85a8803beb07509f88d98638d54 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Thu, 21 Jul 2016 20:23:26 +0200 Subject: [PATCH] Splits fetching data from the web (repositories, identities, contributors and so on) into parallel batches - introducing parallel gem #321 --- Gemfile | 42 ++++---- _ext/github.rb | 6 +- _ext/identities.rb | 2 +- _ext/identities/github.rb | 15 +-- _ext/identities/gravatar.rb | 4 +- _ext/jira.rb | 8 +- _ext/lanyrd.rb | 28 +++-- _ext/repository.rb | 197 +++++++++++++++++----------------- _ext/restclient_extensions.rb | 2 +- 9 files changed, 159 insertions(+), 145 deletions(-) diff --git a/Gemfile b/Gemfile index 484603c2624..7ccc8bd81dc 100644 --- a/Gemfile +++ b/Gemfile @@ -29,26 +29,30 @@ source 'https://rubygems.org' -#gem "awestruct", "0.5.7" -gem "awestruct", :path => '../awestruct' -#gem "awestruct", :git => 'git@github.com:awestruct/awestruct.git' +#gem 'awestruct', '0.5.7' +gem 'awestruct', :path => '../awestruct' +#gem 'awestruct', :git => 'git@github.com:awestruct/awestruct.git' +gem 'git', '1.2.9.1' # newer version incorrectly reads UTF-8 encoded commits (authors for example) gem 'asciidoctor' gem 'haml-contrib' gem 'bootstrap-sass', '< 3.0' -gem "puma" -gem "rest-client", ">= 2.0.0" -gem "hpricot" -gem "RedCloth" -gem "redcarpet" -gem "coffee-script" -gem "uglifier" -gem "htmlcompressor" -gem "compass", "0.12.7" -gem "ri_cal", "0.8.8" -gem "tzinfo", "0.3.33" -gem "therubyracer", "0.11.3" -gem "jruby-openssl", "0.7.7", :platforms => :jruby -gem "rb-inotify", :platforms => [:ruby, :jruby] -gem "versionomy" +gem 'puma' +gem 'rest-client', '>= 2.0.0' +gem 'hpricot' +gem 'RedCloth' +gem 'redcarpet' +gem 'coffee-script' +gem 'uglifier' + +gem 'htmlcompressor' +gem 'compass', '0.12.7' +gem 'ri_cal', '0.8.8' +gem 'tzinfo', '0.3.33' +gem 'therubyracer', '0.11.3' +gem 'jruby-openssl', '0.7.7', :platforms => :jruby +gem 'rb-inotify', :platforms => [:ruby, :jruby] +gem 'versionomy' gem 'rspec', '>= 2.9' -gem "nokogiri", 
"1.6.8" +gem 'nokogiri', '1.6.8' +gem 'ruby-progressbar' +gem 'parallel' diff --git a/_ext/github.rb b/_ext/github.rb index 8b7836f6c1a..5ca46294fbb 100644 --- a/_ext/github.rb +++ b/_ext/github.rb @@ -1,4 +1,6 @@ # -*- encoding : utf-8 -*- +require 'parallel' + module Awestruct::Extensions::Github DEFAULT_BASE_URL = 'https://api.github.com' @@ -27,7 +29,7 @@ def execute(site) milestones_data = RestClient.get milestones_url, :accept => 'application/json', :cache_key => "github/milestones_project-#{@project_key}.json", :cache_expiry_age => DURATION_1_DAY - milestones_data.content.each do |m| + Parallel.each(milestones_data.content, progress: "Fetching milestones of [#{milestones_url}] ") { |m| release_key = m['title'] release_key = "#{@prefix_version}_#{release_key}" unless @prefix_version.nil? @@ -49,7 +51,7 @@ def execute(site) end site.release_notes[release_key] = release_notes - end + } end end diff --git a/_ext/identities.rb b/_ext/identities.rb index aea4b6bc291..af4bbccb082 100644 --- a/_ext/identities.rb +++ b/_ext/identities.rb @@ -78,7 +78,7 @@ def unique_by_emails(contributors) def lookup_by_contributor(contributor) identity = self.find {|e| e.contributor and e.contributor.emails and e.contributor.emails.include? contributor.email } - # identity = lookup_by_emails(contributor.email) + # identity = lookup_by_emails(contributor.email) if identity.nil? # identity = lookup_by_name(contributor.name) if identity.nil? if identity.nil? 
# Indication that we have a mismatched account diff --git a/_ext/identities/github.rb b/_ext/identities/github.rb index 47719560c14..eb3b12c937a 100644 --- a/_ext/identities/github.rb +++ b/_ext/identities/github.rb @@ -1,4 +1,5 @@ # -*- encoding : utf-8 -*- +require 'parallel' module Identities module GitHub @@ -35,17 +36,17 @@ def add_repository(repository) def add_match_filter(match_filter) @match_filters << match_filter - #File.open('/tmp/committers.yml', 'w') do |out| + #File.open('/tmp/committers.yml', 'w:UTF-8') do |out| # YAML.dump(match_filter, out) #end end def collect(identities) - visited = [] - @repositories.each do |r| + visited = Parallel.each(@repositories, progress: + 'Processing contributors from GitHub') { |r| url = CONTRIBUTORS_URL_TEMPLATE % [ r.owner, r.path ] contributors = RestClient.get url, :accept => 'application/json' - contributors.content.each do |acct| + contributors.content.each { |acct| github_id = acct['login'].downcase author = nil @match_filters.each do |filter| @@ -58,9 +59,9 @@ def collect(identities) end identity = identities.lookup_by_github_id(github_id, true) github_acct_to_identity(acct, author, identity) - visited << github_id - end - end + github_id + } + }.flatten # github doesn't keep perfect records of contributors, so handle those omitted contributors @match_filters.each do |filter| diff --git a/_ext/identities/gravatar.rb b/_ext/identities/gravatar.rb index b95c155e043..d3f9ff3c990 100644 --- a/_ext/identities/gravatar.rb +++ b/_ext/identities/gravatar.rb @@ -23,7 +23,7 @@ def crawl(identity) return end url = API_URL_TEMPLATE % hash - response = RestClient.get(url, :user_agent => "rest-client") do |rsp, req, res, &blk| + response = RestClient.get(url, :user_agent => "rest-client", :accept => 'application/json') do |rsp, req, res, &blk| if rsp.code.eql? 
404 rsp = RestClient::Response.create('{}', rsp.net_http_res, req) rsp.instance_variable_set(:@code, 200) @@ -33,7 +33,7 @@ def crawl(identity) end end - data = JSON.parse response + data = response.content if data.empty? return end diff --git a/_ext/jira.rb b/_ext/jira.rb index 788d2571eab..2271cd21baf 100644 --- a/_ext/jira.rb +++ b/_ext/jira.rb @@ -1,4 +1,5 @@ # -*- encoding : utf-8 -*- +require 'parallel' module Awestruct::Extensions::Jira DEFAULT_BASE_URL = 'https://issues.jboss.org' @@ -29,7 +30,8 @@ def execute(site) url = @base_url + (PROJECT_PATH_TEMPLATE % @project_key) project_data = RestClient.get url, :accept => 'application/json', :cache_key => "jira/project-#{@project_key}.json", :cache_expiry_age => DURATION_1_DAY - project_data.content['versions'].each do |v| + Parallel.each(project_data.content['versions'], + progress: "Fetching release notes from JIRA of [#{url}]") { |v| next if !v['released'] release_key = v['name'] release_key = "#{@prefix_version}_#{release_key}" unless @prefix_version.nil? @@ -51,7 +53,7 @@ def execute(site) end site.release_notes[release_key] = release_notes - end + } end end @@ -67,7 +69,7 @@ def execute(site) url = @base_url + (COMPONENTS_PATH_TEMPLATE % @project_key) components = RestClient.get url, :accept => 'application/json', :cache_key => "jira/components-#{@project_key}.json" - components.content.each do |c| + Parallel.each(components.content, progress: "Fetching component leads data from JIRA of of [#{url}]") do |c| component_data = RestClient.get(c['self'], :accept => 'application/json', :cache_key => "jira/component-#{@project_key}-#{c['id']}.json").content if component_data.has_key? 
'lead' and component_data['description'] =~ / :: ([^ ]+)$/ diff --git a/_ext/lanyrd.rb b/_ext/lanyrd.rb index 573f467a0eb..2709c2a14c3 100644 --- a/_ext/lanyrd.rb +++ b/_ext/lanyrd.rb @@ -3,6 +3,7 @@ require 'tzinfo' require 'nokogiri' require 'rest-client' +require 'parallel' require_relative 'common.rb' ## # Lanyrd is an Awestruct extension module for interacting with lanyrd.com @@ -60,19 +61,24 @@ def execute(site) # context=future only works for conferences atm...waiting for session search support search_url = "#{@base}/search/?type=session&q=#{@term}" - sessions = [] + sessions = Hamster::Set.new - pages = [] - - page1 = Nokogiri::HTML(getOrCache(File.join(@lanyrd_tmp, "search-#{@term}-1.html"), search_url)) - pages << page1 + pages = Hamster::Set.new + begin + page1 = Nokogiri::HTML(getOrCache(File.join(@lanyrd_tmp, "search-#{@term}-1.html"), search_url)) + rescue => e + puts e + site.sessions = [] + return + end + pages = pages << page1 extract_pages(page1, pages, search_url) - pages.each do |page| + Parallel.each(pages, progress: "Extracting sessions details from Lanyrd" ) do |page| extract_sessions(page, sessions) end - - site.sessions = sessions.sort_by{|session| session.start_datetime} + sessions = sessions.sort_by{|session| session.start_datetime} + site.sessions = sessions.size.times.map { |i| session.at(i) } end # Find all Pages in a 'root' Page @@ -83,11 +89,11 @@ def extract_pages(root, pages, search_url) root.css('div[@class*=pagination]').each do |p| last_page_index = Integer(p.search('li').last.at('a').inner_text) +1 end - for index in 2...last_page_index + Parallel.each(2...last_page_index, progress: "Fetching Arquillian sessions from Lanyrd") { |index| pageinated_url = "#{search_url}&page=#{index}" pageX = Nokogiri::HTML(getOrCache(File.join(@lanyrd_tmp, "search-#{@term}-#{index}.html"), pageinated_url)) pages << pageX - end + } end # Find all sessions in Page @@ -166,7 +172,7 @@ def extract_sessions(page, sessions) session.speakers << {'name' 
=> name, 'username' => username } end - sessions << session + sessions = sessions << session end end end diff --git a/_ext/repository.rb b/_ext/repository.rb index 548531fec51..cf6d186fe5c 100644 --- a/_ext/repository.rb +++ b/_ext/repository.rb @@ -26,27 +26,27 @@ def execute(site) cache_key = "ohloh/enlistments-#{@ohloh_project_id}-#{page}.xml" # expire after 3 days doc = REXML::Document.new RestClient.get url, :accept => 'application/xml', - :cache_key => cache_key, :cache_expiry_age => 86400 * 3 + :cache_key => cache_key, :cache_expiry_age => 86400 * 3 doc.each_element('/response/result/enlistment/repository/url') do |e| - git_url = e.text + git_url = e.text path = File.basename(git_url.split('/').last, '.git') repository = OpenStruct.new({ - :path => path, - :relative_path => '', - :desc => nil, - :owner => git_url.split('/').last(2).first, - :host => URI(git_url).host, - :type => 'git', - :html_url => git_url.chomp('.git').sub('git://', 'https://'), - :clone_url => git_url - }) + :path => path, + :relative_path => '', + :desc => nil, + :owner => git_url.split('/').last(2).first, + :host => URI(git_url).host, + :type => 'git', + :html_url => git_url.chomp('.git').sub('git://', 'https://'), + :clone_url => git_url + }) @repositories << repository end offset = doc.root.elements['first_item_position'].text.to_i returned = doc.root.elements['items_returned'].text.to_i available = doc.root.elements['items_available'].text.to_i - + if offset + returned < available page += 1 else @@ -55,114 +55,114 @@ def execute(site) end @repositories << OpenStruct.new( - :path => 'arquillian-universe-bom', - :desc => nil, - :relative_path => '', - :owner => 'arquillian', - :host => 'github.com', - :type => 'git', - :commits_url => 'https://api.github.com/repos/arquillian/arquillian-universe-bom/commits{/sha}', - :html_url => 'https://github.com/arquillian/arquillian-universe-bom', - :clone_url => 'git://github.com/arquillian/arquillian-universe-bom.git' + :path => 
'arquillian-universe-bom', + :desc => nil, + :relative_path => '', + :owner => 'arquillian', + :host => 'github.com', + :type => 'git', + :commits_url => 'https://api.github.com/repos/arquillian/arquillian-universe-bom/commits{/sha}', + :html_url => 'https://github.com/arquillian/arquillian-universe-bom', + :clone_url => 'git://github.com/arquillian/arquillian-universe-bom.git' ) @repositories << OpenStruct.new( - :path => 'shrinkwrap', - :desc => nil, - :relative_path => '', - :owner => 'shrinkwrap', - :host => 'github.com', - :type => 'git', - :commits_url => 'https://api.github.com/repos/shrinkwrap/shrinkwrap/commits{/sha}', - :html_url => 'https://github.com/shrinkwrap/shrinkwrap', - :clone_url => 'git://github.com/shrinkwrap/shrinkwrap.git' + :path => 'shrinkwrap', + :desc => nil, + :relative_path => '', + :owner => 'shrinkwrap', + :host => 'github.com', + :type => 'git', + :commits_url => 'https://api.github.com/repos/shrinkwrap/shrinkwrap/commits{/sha}', + :html_url => 'https://github.com/shrinkwrap/shrinkwrap', + :clone_url => 'git://github.com/shrinkwrap/shrinkwrap.git' ) @repositories << OpenStruct.new( - :path => 'resolver', - :desc => nil, - :relative_path => '', - :owner => 'shrinkwrap', - :host => 'github.com', - :type => 'git', - :commits_url => 'https://api.github.com/repos/shrinkwrap/resolver/commits{/sha}', - :html_url => 'https://github.com/shrinkwrap/resolver', - :clone_url => 'git://github.com/shrinkwrap/resolver.git' + :path => 'resolver', + :desc => nil, + :relative_path => '', + :owner => 'shrinkwrap', + :host => 'github.com', + :type => 'git', + :commits_url => 'https://api.github.com/repos/shrinkwrap/resolver/commits{/sha}', + :html_url => 'https://github.com/shrinkwrap/resolver', + :clone_url => 'git://github.com/shrinkwrap/resolver.git' ) @repositories << OpenStruct.new( - :path => 'descriptors', - :desc => nil, - :relative_path => '', - :owner => 'shrinkwrap', - :host => 'github.com', - :type => 'git', - :commits_url => 
'https://api.github.com/repos/shrinkwrap/descriptors/commits{/sha}', - :html_url => 'https://github.com/shrinkwrap/descriptors', - :clone_url => 'git://github.com/shrinkwrap/descriptors.git' + :path => 'descriptors', + :desc => nil, + :relative_path => '', + :owner => 'shrinkwrap', + :host => 'github.com', + :type => 'git', + :commits_url => 'https://api.github.com/repos/shrinkwrap/descriptors/commits{/sha}', + :html_url => 'https://github.com/shrinkwrap/descriptors', + :clone_url => 'git://github.com/shrinkwrap/descriptors.git' ) @repositories << OpenStruct.new( - :path => 'descriptors-docker', - :desc => nil, - :relative_path => '', - :owner => 'shrinkwrap', - :host => 'github.com', - :type => 'git', - :commits_url => 'https://api.github.com/repos/shrinkwrap/descriptors-docker/commits{/sha}', - :html_url => 'https://github.com/shrinkwrap/descriptors-docker', - :clone_url => 'git://github.com/shrinkwrap/descriptors-docker.git' + :path => 'descriptors-docker', + :desc => nil, + :relative_path => '', + :owner => 'shrinkwrap', + :host => 'github.com', + :type => 'git', + :commits_url => 'https://api.github.com/repos/shrinkwrap/descriptors-docker/commits{/sha}', + :html_url => 'https://github.com/shrinkwrap/descriptors-docker', + :clone_url => 'git://github.com/shrinkwrap/descriptors-docker.git' ) @repositories << OpenStruct.new( - :path => 'shrinkwrap-osgi', - :desc => nil, - :relative_path => '', - :owner => 'shrinkwrap', - :host => 'github.com', - :type => 'git', - :commits_url => 'https://api.github.com/repos/shrinkwrap/shrinkwrap-osgi/commits{/sha}', - :html_url => 'https://github.com/shrinkwrap/shrinkwrap-osgi', - :clone_url => 'git://github.com/shrinkwrap/shrinkwrap-osgi.git' + :path => 'shrinkwrap-osgi', + :desc => nil, + :relative_path => '', + :owner => 'shrinkwrap', + :host => 'github.com', + :type => 'git', + :commits_url => 'https://api.github.com/repos/shrinkwrap/shrinkwrap-osgi/commits{/sha}', + :html_url => 
'https://github.com/shrinkwrap/shrinkwrap-osgi', + :clone_url => 'git://github.com/shrinkwrap/shrinkwrap-osgi.git' ) @repositories << OpenStruct.new( - :path => 'wildfly-arquillian', - :desc => nil, - :relative_path => '', - :owner => 'wildfly', - :host => 'github.com', - :type => 'git', - :html_url => 'https://github.com/wildfly/wildfly-arquillian', - :clone_url => 'git://github.com/wildfly/wildfly-arquillian.git' + :path => 'wildfly-arquillian', + :desc => nil, + :relative_path => '', + :owner => 'wildfly', + :host => 'github.com', + :type => 'git', + :html_url => 'https://github.com/wildfly/wildfly-arquillian', + :clone_url => 'git://github.com/wildfly/wildfly-arquillian.git' ) @repositories << OpenStruct.new( - :path => 'tomee', - :desc => nil, - :relative_path => 'arquillian/', - :owner => 'apache', - :host => 'github.com', - :type => 'git', - :html_url => 'https://github.com/apache/tomee', - :clone_url => 'git://github.com/apache/tomee.git' + :path => 'tomee', + :desc => nil, + :relative_path => 'arquillian/', + :owner => 'apache', + :host => 'github.com', + :type => 'git', + :html_url => 'https://github.com/apache/tomee', + :clone_url => 'git://github.com/apache/tomee.git' ) @repositories << OpenStruct.new( - :path => 'jsfunit', - :desc => nil, - :relative_path => 'jboss-jsfunit-arquillian/', - :owner => 'jsfunit', - :host => 'github.com', - :type => 'git', - :html_url => 'https://github.com/jsfunit/jsfunit', - :clone_url => 'git://github.com/jsfunit/jsfunit.git' + :path => 'jsfunit', + :desc => nil, + :relative_path => 'jboss-jsfunit-arquillian/', + :owner => 'jsfunit', + :host => 'github.com', + :type => 'git', + :html_url => 'https://github.com/jsfunit/jsfunit', + :clone_url => 'git://github.com/jsfunit/jsfunit.git' ) @repositories << OpenStruct.new( - :path => 'plugin-arquillian', - :desc => nil, - :relative_path => '', - :owner => 'forge', - :host => 'github.com', - :type => 'git', - :html_url => 'https://github.com/forge/plugin-arquillian', - 
:clone_url => 'git://github.com/forge/plugin-arquillian.git' + :path => 'plugin-arquillian', + :desc => nil, + :relative_path => '', + :owner => 'forge', + :host => 'github.com', + :type => 'git', + :html_url => 'https://github.com/forge/plugin-arquillian', + :clone_url => 'git://github.com/forge/plugin-arquillian.git' ) @repositories.sort! {|a,b| a.path <=> b.path } @@ -313,7 +313,6 @@ def execute(site) rekeyed_index.delete id puts "Merged #{unmatched_name} <#{unmatched_email}> with matched github id #{match.github_id} based on name" else - # TODO This should be logged to its own file - for manual data correction puts "Could not resolve github id for author #{unmatched_name} <#{unmatched_email}>" end end diff --git a/_ext/restclient_extensions.rb b/_ext/restclient_extensions.rb index f75474d349d..a9125ffde5c 100644 --- a/_ext/restclient_extensions.rb +++ b/_ext/restclient_extensions.rb @@ -147,7 +147,7 @@ class << cachedResponse response end - def cache_miss(response) + def cache_miss(response) if response.code == 200 and @cache and @request.method.eql? 'get' and @redirects.eql? @request.headers[:redirects] and !response.body.empty? puts "Cache miss because #{@cache_file} is missing or expired"