From 2793288984e72e98fd0b1cbf8c40a21f3a06eaee Mon Sep 17 00:00:00 2001 From: Rasel Date: Tue, 7 Dec 2021 17:57:18 +0600 Subject: [PATCH 1/4] Added new script to get total available tickets from scraping, added selenium implemtation --- selenium_scraper.rb | 38 ++++++++++++++++ tour_site_scraper.rb | 105 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+) create mode 100644 selenium_scraper.rb create mode 100644 tour_site_scraper.rb diff --git a/selenium_scraper.rb b/selenium_scraper.rb new file mode 100644 index 0000000..c8efa90 --- /dev/null +++ b/selenium_scraper.rb @@ -0,0 +1,38 @@ +require 'rubygems' +require 'selenium-webdriver' +require 'pry' + +# This options are for headless execution of the browser so that it don't need to load browser +# options = Selenium::WebDriver::Firefox::Options.new(args: ['-headless']) +# driver = Selenium::WebDriver.for(:firefox, options: options) + +driver = Selenium::WebDriver.for :firefox +#Generate the search url physically using any date, time and put here, we will make it dynamic later +driver.navigate.to "https://www.tour.ne.jp/j_air/list/?adult=1&air_type=2&arr_out=ITM&change_date_in=0&change_date_out=0&date_out=20211231&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0" + +#Take some time to load the page +# sleep(5) + +ticket_summary_button = wait.until { + elements = driver.find_element(:css, "#Act_Airline_Out") +} + +ticket_summary_button.click + +#Take some time after click on second tab to load result html +# sleep(5) + +#Find available information and search companies name, this is optional +ticket_summary = driver.find_elements(:class, "airline-name") + +#Find available ticke count element +ticket_available_lists = driver.find_elements(:class, "toggle-btn-company") + +#Find each companies available ticket and sum to get total available tickets +total_available_ticket = 0 +ticket_available_lists.each do |ticket_count| + total_available_ticket += 
ticket_count.text.delete("^0-9").to_i +end + +puts "Available ticket companies name = " + ticket_summary.first.text + ", " + ticket_summary.last.text +puts "Total available ticket found is = " + total_available_ticket.to_s \ No newline at end of file diff --git a/tour_site_scraper.rb b/tour_site_scraper.rb new file mode 100644 index 0000000..32498f8 --- /dev/null +++ b/tour_site_scraper.rb @@ -0,0 +1,105 @@ +require 'rubygems' +require 'selenium-webdriver' +require 'pry' + + +# tour_site = Selenium::WebDriver.for :firefox +# tour_site.get "https://www.tour.ne.jp/j_air/list/?adult=1&air_type=2&arr_out=ITM&change_date_in=0&change_date_out=0&date_out=20211221&dpt_out=TYO" + +# options = Selenium::WebDriver::Chrome::Options.new +# options.add_argument('--headless') +# driver = Selenium::WebDriver.for :firefox, options: options + +# options = Selenium::WebDriver::Firefox::Options.new(args: ['-headless']) +# driver = Selenium::WebDriver.for(:firefox, options: options) + +driver = Selenium::WebDriver.for :firefox + +# driver.navigate.to "https://www.tour.ne.jp/j_air/list/?adult=1&air_type=2&arr_out=ITM&change_date_in=0&change_date_out=0&date_out=20211221&dpt_out=TYO" +driver.navigate.to "https://www.tour.ne.jp/j_air/list/?adult=1&air_type=2&arr_out=ITM&change_date_in=0&change_date_out=0&date_out=20211231&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0" +wait = Selenium::WebDriver::Wait.new(:timeout => 10000) +sleep(10) +# driver.find_element(:css, "#Act_Airline_Out").click + +# new WebDriverWait(driver, 20).until(ExpectedConditions.elementToBeClickable(By.cssSelector("##Act_Airline_Out"))).click(); + +ticket_summary_button = wait.until { + elements = driver.find_element(:css, "#Act_Airline_Out") +} + +# ticket_summary_button = wait.until { +# elements = driver.find_elements(:css, "#Act_Airline_Out") +# } + +# binding.pry + +# puts driver + + +# ticket_details = driver.find_elements(:class, "ticket-detail-type-text-ellipsis") + +sleep(10) + +# 
ticket_details = wait.until { +# elements = driver.find_elements(:class, "ticket-detail-type-text-ellipsis") +# } + + +# ticket_summary = wait.until { +# elements = driver.find_elements(:class, "airline-name") +# } + +ticket_summary_button.click + +sleep(5) + +ticket_summary = driver.find_elements(:class, "airline-name") +ticket_available_lists = driver.find_elements(:class, "toggle-btn-company") + +# binding.pry + +available_ticket = 0 + +ticket_available_lists.each do |ticket_count| + available_ticket += ticket_count.text.delete("^0-9").to_i +end +puts "available ticket companies = " + ticket_summary.first.text + ", " + ticket_summary.last.text +puts "Total available ticket found is = " + available_ticket.to_s + + +# binding.pry + + +# ticket_details = tour_site.find_element(:css, ".tbl-list-detail > .ticket-detail-list .ticket-detail-type .ticket-detail-type-text .ticket-detail-type-text-ellipsis") +# available_ticket = 0 +# ticket_details.each do |ticket_info| + +# splited_ticket = ticket_info.text.to_s.split("|")[1] || ticket_info.text.to_s.split("ー")[1] + +# available_ticket += splited_ticket.to_i +# binding.pry +# puts ticket_info.text +# end + +# puts available_ticket + +# binding.pry + + +# loop do +# driver.find_elements(:css, ".p2 div a").each {|link| link.click} +# driver.find_elements(:css, ".p3 a, .firm , .p2 div").each { +# |n,r,c| +# name = n +# region = r +# contacts = c + +# print name.text.center(100) +# puts region +# puts contacts + +# } +# link = driver.find_element(:xpath, "/html/body/table[5]/tbody/tr/td/a[2]" )[:href] +# break if link == "http://www.ypag.ru/cat/komp249/page3780.html" +# driver.get "#{link}" +# end \ No newline at end of file From f500d9c8608affa1c2c3591a1a3845714ebd5f2d Mon Sep 17 00:00:00 2001 From: Rasel Date: Tue, 7 Dec 2021 18:12:30 +0600 Subject: [PATCH 2/4] Added new script to get total available tickets from scraping, added selenium implemtation --- selenium_scraper.rb | 27 +++++++------ tour_site_scraper.rb | 94 
+++++++------------------------------------- 2 files changed, 28 insertions(+), 93 deletions(-) diff --git a/selenium_scraper.rb b/selenium_scraper.rb index c8efa90..a68f176 100644 --- a/selenium_scraper.rb +++ b/selenium_scraper.rb @@ -7,32 +7,33 @@ # driver = Selenium::WebDriver.for(:firefox, options: options) driver = Selenium::WebDriver.for :firefox -#Generate the search url physically using any date, time and put here, we will make it dynamic later + +# driver.navigate.to "https://www.tour.ne.jp/j_air/list/?adult=1&air_type=2&arr_out=ITM&change_date_in=0&change_date_out=0&date_out=20211221&dpt_out=TYO" driver.navigate.to "https://www.tour.ne.jp/j_air/list/?adult=1&air_type=2&arr_out=ITM&change_date_in=0&change_date_out=0&date_out=20211231&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0" +wait = Selenium::WebDriver::Wait.new(:timeout => 10000) #Take some time to load the page -# sleep(5) +sleep(10) ticket_summary_button = wait.until { elements = driver.find_element(:css, "#Act_Airline_Out") } +sleep(10) + ticket_summary_button.click -#Take some time after click on second tab to load result html -# sleep(5) +sleep(5) -#Find available information and search companies name, this is optional ticket_summary = driver.find_elements(:class, "airline-name") - -#Find available ticke count element ticket_available_lists = driver.find_elements(:class, "toggle-btn-company") -#Find each companies available ticket and sum to get total available tickets -total_available_ticket = 0 +# binding.pry + +available_ticket = 0 + ticket_available_lists.each do |ticket_count| - total_available_ticket += ticket_count.text.delete("^0-9").to_i + available_ticket += ticket_count.text.delete("^0-9").to_i end - -puts "Available ticket companies name = " + ticket_summary.first.text + ", " + ticket_summary.last.text -puts "Total available ticket found is = " + total_available_ticket.to_s \ No newline at end of file +puts "available ticket companies = " + 
ticket_summary.first.text + ", " + ticket_summary.last.text +puts "Total available ticket found is = " + available_ticket.to_s \ No newline at end of file diff --git a/tour_site_scraper.rb b/tour_site_scraper.rb index 32498f8..2cb2059 100644 --- a/tour_site_scraper.rb +++ b/tour_site_scraper.rb @@ -2,104 +2,38 @@ require 'selenium-webdriver' require 'pry' - -# tour_site = Selenium::WebDriver.for :firefox -# tour_site.get "https://www.tour.ne.jp/j_air/list/?adult=1&air_type=2&arr_out=ITM&change_date_in=0&change_date_out=0&date_out=20211221&dpt_out=TYO" - -# options = Selenium::WebDriver::Chrome::Options.new -# options.add_argument('--headless') -# driver = Selenium::WebDriver.for :firefox, options: options - +# This options are for headless execution of the browser so that it don't need to load browser # options = Selenium::WebDriver::Firefox::Options.new(args: ['-headless']) # driver = Selenium::WebDriver.for(:firefox, options: options) driver = Selenium::WebDriver.for :firefox - -# driver.navigate.to "https://www.tour.ne.jp/j_air/list/?adult=1&air_type=2&arr_out=ITM&change_date_in=0&change_date_out=0&date_out=20211221&dpt_out=TYO" +#Generate the search url physically using any date, time and put here, we will make it dynamic later driver.navigate.to "https://www.tour.ne.jp/j_air/list/?adult=1&air_type=2&arr_out=ITM&change_date_in=0&change_date_out=0&date_out=20211231&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0" wait = Selenium::WebDriver::Wait.new(:timeout => 10000) -sleep(10) -# driver.find_element(:css, "#Act_Airline_Out").click -# new WebDriverWait(driver, 20).until(ExpectedConditions.elementToBeClickable(By.cssSelector("##Act_Airline_Out"))).click(); +#Take some time to load the page +sleep(2) ticket_summary_button = wait.until { elements = driver.find_element(:css, "#Act_Airline_Out") } -# ticket_summary_button = wait.until { -# elements = driver.find_elements(:css, "#Act_Airline_Out") -# } - -# binding.pry - -# puts driver - - -# 
ticket_details = driver.find_elements(:class, "ticket-detail-type-text-ellipsis") - -sleep(10) - -# ticket_details = wait.until { -# elements = driver.find_elements(:class, "ticket-detail-type-text-ellipsis") -# } - - -# ticket_summary = wait.until { -# elements = driver.find_elements(:class, "airline-name") -# } - ticket_summary_button.click -sleep(5) +#Take some time after click on second tab to load ajax html content +sleep(3) +#Find available information and search companies name, this is optional ticket_summary = driver.find_elements(:class, "airline-name") -ticket_available_lists = driver.find_elements(:class, "toggle-btn-company") - -# binding.pry -available_ticket = 0 +#Find available ticke count element +ticket_available_lists = driver.find_elements(:class, "toggle-btn-company") +#Find each companies available ticket and sum to get total available tickets +total_available_ticket = 0 ticket_available_lists.each do |ticket_count| - available_ticket += ticket_count.text.delete("^0-9").to_i + total_available_ticket += ticket_count.text.delete("^0-9").to_i end -puts "available ticket companies = " + ticket_summary.first.text + ", " + ticket_summary.last.text -puts "Total available ticket found is = " + available_ticket.to_s - - -# binding.pry - - -# ticket_details = tour_site.find_element(:css, ".tbl-list-detail > .ticket-detail-list .ticket-detail-type .ticket-detail-type-text .ticket-detail-type-text-ellipsis") -# available_ticket = 0 -# ticket_details.each do |ticket_info| - -# splited_ticket = ticket_info.text.to_s.split("|")[1] || ticket_info.text.to_s.split("ー")[1] - -# available_ticket += splited_ticket.to_i -# binding.pry -# puts ticket_info.text -# end - -# puts available_ticket - -# binding.pry - - -# loop do -# driver.find_elements(:css, ".p2 div a").each {|link| link.click} -# driver.find_elements(:css, ".p3 a, .firm , .p2 div").each { -# |n,r,c| -# name = n -# region = r -# contacts = c - -# print name.text.center(100) -# puts region -# puts 
contacts -# } -# link = driver.find_element(:xpath, "/html/body/table[5]/tbody/tr/td/a[2]" )[:href] -# break if link == "http://www.ypag.ru/cat/komp249/page3780.html" -# driver.get "#{link}" -# end \ No newline at end of file +puts "Available ticket companies name = " + ticket_summary.first.text + ", " + ticket_summary.last.text +puts "Total available ticket found is = " + total_available_ticket.to_s \ No newline at end of file From ea67157546ca5b90bdcae27a704f3d54d767c552 Mon Sep 17 00:00:00 2001 From: Rasel Date: Tue, 7 Dec 2021 18:34:56 +0600 Subject: [PATCH 3/4] Readme file updated to add some instructions for project running --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1ec1ad9..01e7ee9 100644 --- a/README.md +++ b/README.md @@ -1 +1,4 @@ -# ruby-scrapping \ No newline at end of file +# rubysite scrapping process + +Run `bundle install` +Run `ruby tour_site_scraper.rb` \ No newline at end of file From 91b04c11466b4c73f09af362efc412b6ed46ae6c Mon Sep 17 00:00:00 2001 From: Rasel Date: Tue, 7 Dec 2021 18:42:51 +0600 Subject: [PATCH 4/4] Increased the page-load wait from 2 to 5 seconds to ensure the page has loaded --- tour_site_scraper.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tour_site_scraper.rb b/tour_site_scraper.rb index 2cb2059..80048ca 100644 --- a/tour_site_scraper.rb +++ b/tour_site_scraper.rb @@ -12,7 +12,7 @@ wait = Selenium::WebDriver::Wait.new(:timeout => 10000) #Take some time to load the page -sleep(2) +sleep(5) ticket_summary_button = wait.until { elements = driver.find_element(:css, "#Act_Airline_Out")