From 4e90cc6417c0922acf431a5412d58300733ae695 Mon Sep 17 00:00:00 2001 From: Rasel Date: Fri, 10 Dec 2021 19:39:44 +0600 Subject: [PATCH 01/17] Scraping tickets details and all flights information for between two dates, related code updated --- tour_site_scraper.rb | 126 ++++++++++++++++++++++++++++++------------- 1 file changed, 90 insertions(+), 36 deletions(-) diff --git a/tour_site_scraper.rb b/tour_site_scraper.rb index 78a044b..fc12718 100644 --- a/tour_site_scraper.rb +++ b/tour_site_scraper.rb @@ -2,52 +2,106 @@ require 'selenium-webdriver' require 'pry' -# This options are for headless execution of the browser so that it don't need to load browser +MAX_RETRY = 100 +WAIT = Selenium::WebDriver::Wait.new(timeout: 20) + # options = Selenium::WebDriver::Firefox::Options.new(args: ['-headless']) # driver = Selenium::WebDriver.for(:firefox, options: options) - driver = Selenium::WebDriver.for :firefox + puts 'Trying to fetch data from site.....' puts '--------------------------------------------------------' -# Generate the search url physically using any date, time and put here, we will make it dynamic later based on requirement -driver.navigate.to 'https://www.tour.ne.jp/j_air/list/?adult=1&air_type=2&arr_out=ITM&change_date_in=0&change_date_out=0&date_out=20211231&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0' -sleep(1) # Wait 1s to load the page properly - -MAX_RETRY = 100 -begin - retries ||= 0 - ticket_summary_button = driver.find_element(:css, '#Act_Airline_Out') - return if ticket_summary_button.nil? - ticket_summary_button.click -rescue Exception => e - puts 'Trying to fetch data.. ' + retries.to_s - retries += 1 - sleep(1) # Wait 1s to load the page properly - retry if (retries <= MAX_RETRY) - raise "Could not get ticket website information: Please give necessary information to search" +def check_return_tickets_visibility(driver) + begin + # Wait for few seconds until able to find return tickets list + WAIT.until { driver.find_element(css: "#Act_response_in .company-list").displayed? } + rescue Exception + end end -# Take some time after click to load ajax content until search element can be found -loop do - sleep(1) - if !driver.find_elements(:class, 'airline-name').nil? - break +ticket_search_date_from = Date.new(2021, 12, 31) +ticket_search_date_to = Date.new(2022, 01, 31) +ticket_search_date_from.upto(ticket_search_date_to) do |dt| + departure_date_in = dt.to_s.delete("-") + departure_date_out = dt.to_s.delete("-") + + puts "\n\nTickets for this date " + dt.to_s + + # Generate the search url physically using any date, time and put here, we will make it dynamic later based on requirement + driver.navigate.to 'https://www.tour.ne.jp/j_air/list/?adult=1&arr_in=TYO&arr_out=CTS&change_date_in=0&change_date_out=0&date_in=' + departure_date_in + '&date_out=' + departure_date_out + '&dpt_in=CTS&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0' + sleep(1) # Wait 1s to load the page properly + begin + retries ||= 0 + ticket_summary_button_out = nil + ticket_summary_button_out = driver.find_element(:css, '#Act_Airline_Out') + ticket_summary_button_in = driver.find_element(:css, '#Act_Airline_In') + return if ticket_summary_button_out.nil? && ticket_summary_button_in.nil? + ticket_summary_button_out.click + ticket_summary_button_in.click + + WAIT.until { driver.find_element(css: "#Act_response_out .company-list").displayed? } + rescue Exception => e + puts 'Trying to fetch data.. ' + retries.to_s + retries += 1 + sleep(1) # Wait 1s to load the page properly + retry if (retries <= MAX_RETRY) + raise "Could not get ticket website information: Please give necessary information to search" end -end -# Find available information and available ticket list elements -ticket_summary = driver.find_elements(:class, 'airline-name') -ticket_available_lists = driver.find_elements(:class, 'toggle-btn-company') + check_return_tickets_visibility(driver) -# Parse elements to find each companies available ticket and sum to get total available tickets -total_available_ticket = 0 -!ticket_available_lists.nil? && ticket_available_lists.each do |ticket_count| - total_available_ticket += ticket_count.text.delete('^0-9').to_i -end + ticket_lists = driver.find_elements(:css, '#Act_response_in .company-list .company-box') + + ticket_lists.each do |ticket_list| + ticket_flight_lists = ticket_list.find_elements(:css, '.Act_flight_list') + # test = ticket_flight_lists.find_elements(:xpath, '//*[@id="Act_response_in"]/div/ul[contains(@class,"ticket-code")]') -puts 'Available ticket companies name = ' -!ticket_summary.nil? && ticket_summary.each do |ticket_cmpany| - puts ticket_cmpany.text.to_s + ', ' + ticket_flight_lists.each do |flight| + ticket_code = flight.find_elements(:css, '.ticket-summary-row > span')[1].attribute("innerHTML") + puts 'ticket codes are = ' + ticket_code + # flight.attribute("innerHTML") + # binding.pry + end + end + + +# binding.pry + + # Scrap Available Tickets Elements + ticket_summary = driver.find_elements(:css, '#Act_response_out .airline-name') + ticket_available_lists = driver.find_elements(:css, '#Act_response_out .toggle-btn-company') + + # Parse elements to find each companies available ticket and sum + total_available_ticket = 0 + !ticket_available_lists.nil? && ticket_available_lists.each do |ticket_count| + total_available_ticket += ticket_count.text.delete('^0-9').to_i + end + + # Scrap Returning Tickets Elements + ticket_summary_in = driver.find_elements(:css, '#Act_response_in .airline-name') + ticket_available_lists_in = driver.find_elements(:css, '#Act_response_in .toggle-btn-company') + + # Parse elements to find each companies returning tickets and sum + total_available_ticket_in = 0 + !ticket_available_lists_in.nil? && ticket_available_lists_in.each do |ticket_count_in| + total_available_ticket_in += ticket_count_in.text.delete('^0-9').to_i + end + + # Write all tickets search results + puts 'Total available ticket OUT found is = ' + total_available_ticket.to_s + puts 'Total available ticket IN found is = ' + total_available_ticket_in.to_s + + puts 'Available ticket IN companies name : ' + puts '------------------------------------' + !ticket_summary_in.nil? && ticket_summary_in.each do |ticket_cmpany_in| + puts ticket_cmpany_in.text.to_s + ', ' + end + + puts + puts 'Available ticket OUT companies name : ' + puts '-------------------------------------' + !ticket_summary.nil? && ticket_summary.each do |ticket_cmpany| + puts ticket_cmpany.text.to_s + ', ' + end end -puts 'Total available ticket found is = ' + total_available_ticket.to_s From 9168f831f6c0a04efde74ec4023ecc625ab622df Mon Sep 17 00:00:00 2001 From: Rasel Date: Mon, 13 Dec 2021 18:26:37 +0600 Subject: [PATCH 02/17] Ticket details scraping implemented for gettings all tickets information --- tour_site_scraper.rb | 99 ++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 50 deletions(-) diff --git a/tour_site_scraper.rb b/tour_site_scraper.rb index fc12718..9fe73e8 100644 --- a/tour_site_scraper.rb +++ b/tour_site_scraper.rb @@ -20,6 +20,47 @@ def check_return_tickets_visibility(driver) end end +def scrap_ticket_details(driver, ticket_details_type) + if ticket_details_type == 'in' + ticket_lists = driver.find_elements(:css, '#Act_response_in .company-list .company-box') + else + ticket_lists = driver.find_elements(:css, '#Act_response_out .company-list .company-box') + end + + total_ticket_found = 0 + all_tickets_details_lists = [] + ticket_lists&.each do |ticket_list| + temp_ticket_company_info = {} + number_of_ticket_found = 0 + + ticket_company_name = ticket_list.find_element(:css, '.airline-name').text + number_of_ticket_found = ticket_list.find_element(:css, '.toggle-btn-company').text.delete('^0-9').to_i + total_ticket_found += number_of_ticket_found + ticket_minimum_price = ticket_list.find_element(:css, '.hdg-sup-price > b').text + + temp_ticket_company_info[:ticket_company_name] = ticket_company_name + temp_ticket_company_info[:ticket_minimum_price] = ticket_minimum_price + temp_ticket_company_info[:number_of_ticket_found] = number_of_ticket_found + + flight_lists = [] + ticket_company_lists = ticket_list.find_elements(:css, '.Act_flight_list') + ticket_company_lists&.each do |flight| + ticket_code = flight.find_elements(:css, '.ticket-summary-row > span')[1].attribute("innerHTML") + ticket_price = flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-price > label > b')[0].attribute("innerHTML") + ticket_seat = flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-seat')[0].attribute("innerHTML") + ticket_changable_status = flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-date')[0].attribute("innerHTML") + ticket_type = flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-type-text .ticket-detail-type-text-ellipsis')[0].attribute("innerHTML") + flight_data = {} + flight_data['flight_code'] = ticket_code + flight_data['flight_price'] = ticket_price + flight_lists.push(flight_data) + end + temp_ticket_company_info[:flight_lists] = flight_lists + all_tickets_details_lists.push(temp_ticket_company_info) + end + return all_tickets_details_lists, total_ticket_found +end + ticket_search_date_from = Date.new(2021, 12, 31) ticket_search_date_to = Date.new(2022, 01, 31) ticket_search_date_from.upto(ticket_search_date_to) do |dt| @@ -51,57 +92,15 @@ def check_return_tickets_visibility(driver) check_return_tickets_visibility(driver) - ticket_lists = driver.find_elements(:css, '#Act_response_in .company-list .company-box') - - ticket_lists.each do |ticket_list| - ticket_flight_lists = ticket_list.find_elements(:css, '.Act_flight_list') - # test = ticket_flight_lists.find_elements(:xpath, '//*[@id="Act_response_in"]/div/ul[contains(@class,"ticket-code")]') - - ticket_flight_lists.each do |flight| - ticket_code = flight.find_elements(:css, '.ticket-summary-row > span')[1].attribute("innerHTML") - puts 'ticket codes are = ' + ticket_code - # flight.attribute("innerHTML") - # binding.pry - end - end - + tickets_out_list = scrap_ticket_details(driver, 'out') + all_ticket_out_lists = tickets_out_list[0] + total_ticket_out_found = tickets_out_list[1] -# binding.pry + tickets_in_list = scrap_ticket_details(driver, 'in') + all_ticket_in_details = tickets_in_list[0] + total_ticket_in_found = tickets_in_list[1] - # Scrap Available Tickets Elements - ticket_summary = driver.find_elements(:css, '#Act_response_out .airline-name') - ticket_available_lists = driver.find_elements(:css, '#Act_response_out .toggle-btn-company') + puts "Total tickets found for out is = " + total_ticket_out_found.to_s + puts "Total tickets found for in is = " + total_ticket_in_found.to_s - # Parse elements to find each companies available ticket and sum - total_available_ticket = 0 - !ticket_available_lists.nil? && ticket_available_lists.each do |ticket_count| - total_available_ticket += ticket_count.text.delete('^0-9').to_i - end - - # Scrap Returning Tickets Elements - ticket_summary_in = driver.find_elements(:css, '#Act_response_in .airline-name') - ticket_available_lists_in = driver.find_elements(:css, '#Act_response_in .toggle-btn-company') - - # Parse elements to find each companies returning tickets and sum - total_available_ticket_in = 0 - !ticket_available_lists_in.nil? && ticket_available_lists_in.each do |ticket_count_in| - total_available_ticket_in += ticket_count_in.text.delete('^0-9').to_i - end - - # Write all tickets search results - puts 'Total available ticket OUT found is = ' + total_available_ticket.to_s - puts 'Total available ticket IN found is = ' + total_available_ticket_in.to_s - - puts 'Available ticket IN companies name : ' - puts '------------------------------------' - !ticket_summary_in.nil? && ticket_summary_in.each do |ticket_cmpany_in| - puts ticket_cmpany_in.text.to_s + ', ' - end - - puts - puts 'Available ticket OUT companies name : ' - puts '-------------------------------------' - !ticket_summary.nil? && ticket_summary.each do |ticket_cmpany| - puts ticket_cmpany.text.to_s + ', ' - end end From 67bb30eddcb1a9e38fb7fb0d4b8fb2ca46afd0a9 Mon Sep 17 00:00:00 2001 From: Rasel Date: Mon, 13 Dec 2021 23:19:09 +0600 Subject: [PATCH 03/17] Refactored code --- tour_site_scraper.rb | 66 ++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/tour_site_scraper.rb b/tour_site_scraper.rb index 9fe73e8..a568463 100644 --- a/tour_site_scraper.rb +++ b/tour_site_scraper.rb @@ -4,27 +4,49 @@ MAX_RETRY = 100 WAIT = Selenium::WebDriver::Wait.new(timeout: 20) - +WEB_DRIVER = Selenium::WebDriver.for :firefox # options = Selenium::WebDriver::Firefox::Options.new(args: ['-headless']) # driver = Selenium::WebDriver.for(:firefox, options: options) -driver = Selenium::WebDriver.for :firefox + puts 'Trying to fetch data from site.....' puts '--------------------------------------------------------' -def check_return_tickets_visibility(driver) +def start_scraping(departure_date_in, departure_date_out) + # Generate the search url physically using any date, time and put here, we will make it dynamic later based on requirement + WEB_DRIVER.navigate.to 'https://www.tour.ne.jp/j_air/list/?adult=1&arr_in=TYO&arr_out=CTS&change_date_in=0&change_date_out=0&date_in=' + departure_date_in + '&date_out=' + departure_date_out + '&dpt_in=CTS&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0' + sleep(1) # Wait 1s to load the page properly + begin + retries ||= 0 + ticket_summary_button_out = nil + ticket_summary_button_out = WEB_DRIVER.find_element(:css, '#Act_Airline_Out') + ticket_summary_button_in = WEB_DRIVER.find_element(:css, '#Act_Airline_In') + return if ticket_summary_button_out.nil? && ticket_summary_button_in.nil? + ticket_summary_button_out.click + ticket_summary_button_in.click + rescue Exception => e + puts 'Trying to fetch data.. ' + retries.to_s + retries += 1 + sleep(1) # Wait 1s to load the page properly + retry if (retries <= MAX_RETRY) + raise "Could not get ticket website information: Please give necessary information to search" + end +end + +def check_return_tickets_visibility begin # Wait for few seconds until able to find return tickets list - WAIT.until { driver.find_element(css: "#Act_response_in .company-list").displayed? } + WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_out .company-list").displayed? } + WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_in .company-list").displayed? } rescue Exception end end -def scrap_ticket_details(driver, ticket_details_type) +def scrap_ticket_details(ticket_details_type) if ticket_details_type == 'in' - ticket_lists = driver.find_elements(:css, '#Act_response_in .company-list .company-box') + ticket_lists = WEB_DRIVER.find_elements(:css, '#Act_response_in .company-list .company-box') else - ticket_lists = driver.find_elements(:css, '#Act_response_out .company-list .company-box') + ticket_lists = WEB_DRIVER.find_elements(:css, '#Act_response_out .company-list .company-box') end total_ticket_found = 0 @@ -68,39 +90,17 @@ def scrap_ticket_details(driver, ticket_details_type) departure_date_out = dt.to_s.delete("-") puts "\n\nTickets for this date " + dt.to_s + start_scraping(departure_date_in, departure_date_out) + check_return_tickets_visibility - # Generate the search url physically using any date, time and put here, we will make it dynamic later based on requirement - driver.navigate.to 'https://www.tour.ne.jp/j_air/list/?adult=1&arr_in=TYO&arr_out=CTS&change_date_in=0&change_date_out=0&date_in=' + departure_date_in + '&date_out=' + departure_date_out + '&dpt_in=CTS&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0' - sleep(1) # Wait 1s to load the page properly - begin - retries ||= 0 - ticket_summary_button_out = nil - ticket_summary_button_out = driver.find_element(:css, '#Act_Airline_Out') - ticket_summary_button_in = driver.find_element(:css, '#Act_Airline_In') - return if ticket_summary_button_out.nil? && ticket_summary_button_in.nil? - ticket_summary_button_out.click - ticket_summary_button_in.click - - WAIT.until { driver.find_element(css: "#Act_response_out .company-list").displayed? } - rescue Exception => e - puts 'Trying to fetch data.. ' + retries.to_s - retries += 1 - sleep(1) # Wait 1s to load the page properly - retry if (retries <= MAX_RETRY) - raise "Could not get ticket website information: Please give necessary information to search" - end - - check_return_tickets_visibility(driver) - - tickets_out_list = scrap_ticket_details(driver, 'out') + tickets_out_list = scrap_ticket_details('out') all_ticket_out_lists = tickets_out_list[0] total_ticket_out_found = tickets_out_list[1] - tickets_in_list = scrap_ticket_details(driver, 'in') + tickets_in_list = scrap_ticket_details('in') all_ticket_in_details = tickets_in_list[0] total_ticket_in_found = tickets_in_list[1] puts "Total tickets found for out is = " + total_ticket_out_found.to_s puts "Total tickets found for in is = " + total_ticket_in_found.to_s - end From efeea1dc58f66f5ec227c34fdece6c3a32315872 Mon Sep 17 00:00:00 2001 From: Rasel Date: Tue, 14 Dec 2021 14:02:31 +0600 Subject: [PATCH 04/17] scraper implementation code refactored for avoiding ajax error issue --- tour_site_scraper.rb | 96 ++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/tour_site_scraper.rb b/tour_site_scraper.rb index a568463..01a8e5f 100644 --- a/tour_site_scraper.rb +++ b/tour_site_scraper.rb @@ -2,13 +2,18 @@ require 'selenium-webdriver' require 'pry' -MAX_RETRY = 100 -WAIT = Selenium::WebDriver::Wait.new(timeout: 20) +MAX_RETRY = 40 # Maximum retry until the serarch page load in seconds +MAX_CALL = 3 # Maximum recall air ticket site if any ajax error or busy page shown +# Put Ticket Search input dates here +TICKET_SEARCH_FROM_DATE = Date.new(2021, 12, 31) +TICKET_SEARCH_TO_DATE = Date.new(2022, 01, 31) + +WAIT = Selenium::WebDriver::Wait.new(timeout: 20) # Maximum wait to find out search results html WEB_DRIVER = Selenium::WebDriver.for :firefox + # options = Selenium::WebDriver::Firefox::Options.new(args: ['-headless']) # driver = Selenium::WebDriver.for(:firefox, options: options) - puts 'Trying to fetch data from site.....' puts '--------------------------------------------------------' @@ -33,73 +38,70 @@ def start_scraping(departure_date_in, departure_date_out) end end -def check_return_tickets_visibility - begin - # Wait for few seconds until able to find return tickets list - WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_out .company-list").displayed? } - WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_in .company-list").displayed? } - rescue Exception - end -end - -def scrap_ticket_details(ticket_details_type) +def searching_ticket_type(ticket_details_type) if ticket_details_type == 'in' - ticket_lists = WEB_DRIVER.find_elements(:css, '#Act_response_in .company-list .company-box') + ticket_airlines = WEB_DRIVER.find_elements(:css, '#Act_response_in .company-list .company-box') else - ticket_lists = WEB_DRIVER.find_elements(:css, '#Act_response_out .company-list .company-box') + ticket_airlines = WEB_DRIVER.find_elements(:css, '#Act_response_out .company-list .company-box') end total_ticket_found = 0 all_tickets_details_lists = [] - ticket_lists&.each do |ticket_list| - temp_ticket_company_info = {} + ticket_airlines&.each do |ticket_airline| + temp_ticket_airline_info = {} number_of_ticket_found = 0 - ticket_company_name = ticket_list.find_element(:css, '.airline-name').text - number_of_ticket_found = ticket_list.find_element(:css, '.toggle-btn-company').text.delete('^0-9').to_i + ticket_company_name = ticket_airline.find_element(:css, '.airline-name').text + number_of_ticket_found = ticket_airline.find_element(:css, '.toggle-btn-company').text.delete('^0-9').to_i total_ticket_found += number_of_ticket_found - ticket_minimum_price = ticket_list.find_element(:css, '.hdg-sup-price > b').text + ticket_minimum_price = ticket_airline.find_element(:css, '.hdg-sup-price > b').text - temp_ticket_company_info[:ticket_company_name] = ticket_company_name - temp_ticket_company_info[:ticket_minimum_price] = ticket_minimum_price - temp_ticket_company_info[:number_of_ticket_found] = number_of_ticket_found + temp_ticket_airline_info[:ticket_company_name] = ticket_company_name + temp_ticket_airline_info[:ticket_minimum_price] = ticket_minimum_price + temp_ticket_airline_info[:number_of_ticket_found] = number_of_ticket_found - flight_lists = [] - ticket_company_lists = ticket_list.find_elements(:css, '.Act_flight_list') - ticket_company_lists&.each do |flight| - ticket_code = flight.find_elements(:css, '.ticket-summary-row > span')[1].attribute("innerHTML") - ticket_price = flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-price > label > b')[0].attribute("innerHTML") - ticket_seat = flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-seat')[0].attribute("innerHTML") - ticket_changable_status = flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-date')[0].attribute("innerHTML") - ticket_type = flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-type-text .ticket-detail-type-text-ellipsis')[0].attribute("innerHTML") + ticket_flight_lists = [] + ticket_airline_flights_lists = ticket_airline.find_elements(:css, '.Act_flight_list') + ticket_airline_flights_lists&.each do |ticket_flight| flight_data = {} - flight_data['flight_code'] = ticket_code - flight_data['flight_price'] = ticket_price - flight_lists.push(flight_data) + flight_data['flight_code'] = ticket_flight.find_elements(:css, '.ticket-summary-row > span')[1].attribute("innerHTML") + flight_data['flight_price'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-price > label > b')[0].attribute("innerHTML") + flight_data['flight_seat'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-seat')[0].attribute("innerHTML") + flight_data['flight_changable_status'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-date')[0].attribute("innerHTML") + flight_data['flight_type'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-type-text .ticket-detail-type-text-ellipsis')[0].attribute("innerHTML") + ticket_flight_lists.push(flight_data) end - temp_ticket_company_info[:flight_lists] = flight_lists - all_tickets_details_lists.push(temp_ticket_company_info) + temp_ticket_airline_info[:ticket_flight_lists] = ticket_flight_lists + all_tickets_details_lists.push(temp_ticket_airline_info) end return all_tickets_details_lists, total_ticket_found end -ticket_search_date_from = Date.new(2021, 12, 31) -ticket_search_date_to = Date.new(2022, 01, 31) -ticket_search_date_from.upto(ticket_search_date_to) do |dt| +TICKET_SEARCH_FROM_DATE.upto(TICKET_SEARCH_TO_DATE) do |dt| departure_date_in = dt.to_s.delete("-") departure_date_out = dt.to_s.delete("-") puts "\n\nTickets for this date " + dt.to_s - start_scraping(departure_date_in, departure_date_out) - check_return_tickets_visibility - tickets_out_list = scrap_ticket_details('out') - all_ticket_out_lists = tickets_out_list[0] - total_ticket_out_found = tickets_out_list[1] + begin + retries ||= 0 + start_scraping(departure_date_in, departure_date_out) + # Wait for few seconds until able to find return tickets list + WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_out .company-list").displayed? } + WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_in .company-list").displayed? } + tickets_out_lists = searching_ticket_type('out') + tickets_in_lists = searching_ticket_type('in') + rescue Exception + puts "retris in check visibility=====" + retries.to_s + retries += 1 + retry if (retries <= MAX_CALL) + raise "Could not get ticket website information: Please give necessary information to search" + end - tickets_in_list = scrap_ticket_details('in') - all_ticket_in_details = tickets_in_list[0] - total_ticket_in_found = tickets_in_list[1] + all_ticket_out_lists = tickets_out_lists[0] + total_ticket_out_found = tickets_out_lists[1] + all_ticket_in_details = tickets_in_lists[0] + total_ticket_in_found = tickets_in_lists[1] puts "Total tickets found for out is = " + total_ticket_out_found.to_s puts "Total tickets found for in is = " + total_ticket_in_found.to_s From 27998355810bde1a99d9c103fddf20da20e254a0 Mon Sep 17 00:00:00 2001 From: Rasel Date: Tue, 14 Dec 2021 15:28:42 +0600 Subject: [PATCH 05/17] scraping site url line break added --- tour_site_scraper.rb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tour_site_scraper.rb b/tour_site_scraper.rb index 01a8e5f..7f27c72 100644 --- a/tour_site_scraper.rb +++ b/tour_site_scraper.rb @@ -19,8 +19,10 @@ def start_scraping(departure_date_in, departure_date_out) # Generate the search url physically using any date, time and put here, we will make it dynamic later based on requirement - WEB_DRIVER.navigate.to 'https://www.tour.ne.jp/j_air/list/?adult=1&arr_in=TYO&arr_out=CTS&change_date_in=0&change_date_out=0&date_in=' + departure_date_in + '&date_out=' + departure_date_out + '&dpt_in=CTS&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0' - sleep(1) # Wait 1s to load the page properly + WEB_DRIVER.navigate.to "https://www.tour.ne.jp/j_air/list/?adult=1&arr_in=TYO&arr_out=CTS&change_date_in=0&change_date_out=0&" + + "date_in=#{departure_date_in}&date_out=#{departure_date_out}&dpt_in=" + + "CTS&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0" + sleep(1) begin retries ||= 0 ticket_summary_button_out = nil From 332c148a54f8b56de10d8f5edb8a541a092f83b4 Mon Sep 17 00:00:00 2001 From: Rasel Date: Tue, 14 Dec 2021 15:30:10 +0600 Subject: [PATCH 06/17] Removed extra line --- tour_site_scraper.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tour_site_scraper.rb b/tour_site_scraper.rb index 7f27c72..bf60ecc 100644 --- a/tour_site_scraper.rb +++ b/tour_site_scraper.rb @@ -4,6 +4,7 @@ MAX_RETRY = 40 # Maximum retry until the serarch page load in seconds MAX_CALL = 3 # Maximum recall air ticket site if any ajax error or busy page shown + # Put Ticket Search input dates here TICKET_SEARCH_FROM_DATE = Date.new(2021, 12, 31) TICKET_SEARCH_TO_DATE = Date.new(2022, 01, 31) @@ -94,7 +95,6 @@ def searching_ticket_type(ticket_details_type) tickets_out_lists = searching_ticket_type('out') tickets_in_lists = searching_ticket_type('in') rescue Exception - puts "retris in check visibility=====" + retries.to_s retries += 1 retry if (retries <= MAX_CALL) raise "Could not get ticket website information: Please give necessary information to search" From 6954db3b49d6322daac4a8a29533303a49920c34 Mon Sep 17 00:00:00 2001 From: Rasel Date: Tue, 14 Dec 2021 19:17:13 +0600 Subject: [PATCH 07/17] DB schema added for scraper data save --- Gemfile | 1 + Gemfile.lock | 6 ++++++ database.rb | 37 +++++++++++++++++++++++++++++++++++++ db_tour_scraper.db | Bin 0 -> 16384 bytes 4 files changed, 44 insertions(+) create mode 100644 database.rb create mode 100644 db_tour_scraper.db diff --git a/Gemfile b/Gemfile index 9004097..59faf60 100644 --- a/Gemfile +++ b/Gemfile @@ -4,3 +4,4 @@ gem 'watir', '~> 6.19', '>= 6.19.1' gem 'webdrivers', '~> 4.6' gem 'nokogiri', '~> 1.11', '>= 1.11.7' gem 'geckodriver-helper', '~> 0.0.3' +gem 'sqlite3' diff --git a/Gemfile.lock b/Gemfile.lock index 8a88c71..15cda8e 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -7,6 +7,10 @@ GEM geckodriver-helper (0.0.5) archive-zip (~> 0.7) io-like (0.3.1) + mini_portile2 (2.6.1) + nokogiri (1.12.5) + mini_portile2 (~> 2.6.1) + racc (~> 1.4) nokogiri (1.12.5-x86_64-linux) racc (~> 1.4) racc (1.6.0) @@ -17,6 +21,7 @@ GEM childprocess (>= 0.5, < 5.0) rexml (~> 3.2, >= 3.2.5) rubyzip (>= 1.2.2) + sqlite3 (1.4.2) watir (6.19.1) regexp_parser (>= 1.2, < 3) selenium-webdriver (>= 3.142.7) @@ -32,6 +37,7 @@ PLATFORMS DEPENDENCIES geckodriver-helper (~> 0.0.3) nokogiri (~> 1.11, >= 1.11.7) + sqlite3 watir (~> 6.19, >= 6.19.1) webdrivers (~> 4.6) diff --git a/database.rb b/database.rb new file mode 100644 index 0000000..bf44f39 --- /dev/null +++ b/database.rb @@ -0,0 +1,37 @@ +require 'sqlite3' + +db = SQLite3::Database.open "db_tour_scraper.db" + +db.execute <fgM7zglVnN+D_fW%=}b8EC04WS(ogyN2BT?<}?w3B5T7cBBpB`#BstM>gk z^KEd_m^N7j95_{fOG#uuk7N61Uwr*>ptK~@G>e5MFP*23>pE`;aU91p&%SwVY1UmE z_`}z8PGA2#FbUqHAI_7}CLf;Xf^pJ zHMgDCiA8WV9G|42B%cD-eIKyS`I)~Fnq_Jt?Y}qL6A`bwyf~}%sfN^x*~66Hh|&JV zu$W!v!PI=hXpb^F?9%IM;<=r9-ch07_Lp?8b$D{(-oGl{R=-#$HS1nYt{MF|x_lnT zB3tk}odunw@k}HOz7?ceVU*s=T=SJ>Z!ikR<<6v9M0Ury_%2jb9EDBnvj2iNB+LET z0=b^(R>&&6a=bfr%k#He_x~N Date: Tue, 14 Dec 2021 19:24:38 +0600 Subject: [PATCH 08/17] DB schema column name updated --- database.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/database.rb b/database.rb index bf44f39..b5cd38f 100644 --- a/database.rb +++ b/database.rb @@ -18,7 +18,7 @@ airline_company_name VARCHAR(100), ticket_lowest_price FLOAT, total_flights_available INTEGER, - tickey_type VARCHAR(10), + ticket_type VARCHAR(10), FOREIGN KEY(ticket_summary_id) REFERENCES tickets_summary(id) ); SQL From 5c3820db6e809bd11735ebbd30b13229c365b34c Mon Sep 17 00:00:00 2001 From: Rasel Date: Wed, 15 Dec 2021 14:23:22 +0600 Subject: [PATCH 09/17] Scraping data saving function added, updated schema --- README.md | 2 ++ database.rb | 37 ---------------------------------- db_tour_scraper.db | Bin 16384 -> 28672 bytes tour_site_scraper.rb | 46 +++++++++++++++++++++++++++++++++++-------- 4 files changed, 40 insertions(+), 45 deletions(-) delete mode 100644 database.rb diff --git a/README.md b/README.md index 9f6e735..b86eeaa 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ # rubysite scrapping process Run `bundle install` +Run `ruby migration.rb` Run `ruby tour_site_scraper.rb` + diff --git a/database.rb b/database.rb deleted file mode 100644 index b5cd38f..0000000 --- a/database.rb +++ /dev/null @@ -1,37 +0,0 @@ -require 'sqlite3' - -db = SQLite3::Database.open "db_tour_scraper.db" - -db.execute <7;QFqujCPRyrf;~Af9S!Llk%MPE7y(A$ z^AgAo+Z>HuUEJC6O#JvvBA!gmBR0V%g)2UQ)Dv^&=^!(Jx_}o)u9I@p&uw(Sh=~IOOqF_Ym z8ySoC?;a*;cixw!-!~TRi;hKyd!yqxhID=q>CWV|-{sHFSsjf%J>1!+DNv(}B@qm2 z_EdZ>nMhY`u$4frDEmCP(+jh+@zm*9Den9v7^$6(&BbT$$ATzk=AVPqV+Q}e!I6kD zoAa6YOn%M8;?KsDGh`BqCkXw^kkdIWL%fh^S>o7i03WTflLb6Uai^b|fgNcqop@#; zF*lW1_Rv~_c3ER58Fv7j{%kvE923@(Da5Gx8Nr7&C4DYy{GTRiI&mtV$}FT3vFSMM zs&~OJvb0kPcy6xrxlD2vj?vWoY;1lZv$V)1V+GU3%zU|~bRwRbIvFDh+W!RHAu%#^ z0`tj}KJh5NzcD$NZ*%#{-kjs#x4=JkV+0rhMt~7u1Q-EEfDvE>7y(9r5nu!ufiEzD zZJg86+PZeF%yE(+iQS^qEsKSFf+h$;P!RIZh^kLiwu!-QqFi^_!~fo5Mt~7u1Q-EEfDvE>7y(9r5nu!ufiES2^)=QWE?>_t{eTC2_Qc&_wRUm& zr}48sQvhN^Wf-eDYnL7=dQD!{Tgc;#vnHgOVH&mEZ%Z`GvXZSSFP`)m)E_ zk>Q9W1ynh-xzO0*g2paz>e(Ob*&pe-_w?L(J$qTy0;(o>3c@W;5QA6=( zKe;o(fFuj{f+AoCMUW`6*Y)h1dM>Bu&goeQ{~Iu7y(9r5nu!u0Y-okU<4QeMt~9cauKi(lluR# z<*J4M9sde{l;7h0m-j=c{m*-MdpCIQc|P)7@_f}Z;R$$b?mxIMx)W}p{Mzuf z)_2xhTz_`G>pJH;<_fu1IsfXs>in8>(Aiw~x4Ij3=jsmDwL9)P-gmt0IO5oB|A+k- z_OILf?B3dsYhSOOt<`M*v3+2B(RR@0v);A-)Oy;w)9R}EQ_X8N(>2}PJ?_`s3)}>^ z-m(bEE#1^%xb)G~)D|3qOCLf=GOjBvUOIpKwV!`-{hNy~zH|Hg-&nkQ{?juT?q180 z3pV6>(W}}J*qXsM2sR}sYbJf_AcosZ!UZ87xs0`9IAJdh*CGy(aFU?G6&xuj7(SBY z8&^KL{@&u*D~q?DUp)V_#hc$&2QW}S4P-Vytslp=#~7Ds9(&NY8+@W7n)Ru>G0i@j zreczZ+K09%wV93P}2ELekVshl^mi z9Y&gF=Um-^;d*E|vo@^6E`(^f7BNWE6p1ff^~9hBF+t5p&@2#D z!vs~DpjrHyis2L^LDN13eKPeymcc+M6kPF4smhq9MAI~jUz0GLXryV@r-~R(pyABI z)C3H-)d*+Sr*6e?^)#H>MY;76!`9Uu*)<|4icu3OX1Qxb&wWeJUD0!wp&}t~?M6d@ z803%$H4hD6Zj4vO<%R$nx~M@DL!|0bEkYL>worp4i2?XlVKu`RH28@D&cWhJ5&UTA zqy}weLnj(Ks6koT(1C__YJe|Fu0*LF4Q<2#2kVN4HZ*Le272&Rr?eRjn`lY|>k9L* z2@M-*1iJ34N7#r47d4QTTYci%TdAR?#W%ipGAIj1kKRn#@3?qRB{oZrK! za}piAo>dq}xCwNPjcvUT^$&z(g;pPnj}txj7AYuZe?n`RH{i`FJ&v#G*>}Sap|y@! z;S36eOOsZ)(Xa*|92!!?0bJu9Fx0mXk@`Gqv3nn;QlQQVarnP8VLrj`U z(;7525EChknu*karqy5yO5uQD#TD4k2EU)8H=cWB=6?Ip`8H=!6iQt%Y)NJD?m- zc1O+iRHunbKq#AF2bAWi&_#ucC^`TDMC9H!U|MN7cZjO z3MG2dVOK3ZABjehyg2f*m#TIs*Mp&@#YT@i*{GBohT`*)ABy%w*Jg!^xhjZ8(R@jV zTP|a>MJo1W6KSI_%dq<~wW||J{T?=v-B#mWfGFR0E`^4Y7iN`|@ku07sSk&OCL<|O z;CB)mWhsl{fGCK?jLL3u6;k+La*0Nze?=GB|F3mFYT-ZRU*Zq(8@>OA&;I|=`>eOe zyTdA&uaG__f7Xp?h_zoH%5RFU<4QeMt~7u1Q-EEfD!orBjBIHE!T0f z<&p*X1bFeKo#HK*|2RtHL<*M!t4QN0Jw~NkHR&;w#;8=OCXJ!=Nh+19NuNaNQ7WxE ztfMGBLZonou*$HGp!6`6Rvp%1lpdnes>3>j(n%_vP}NSZ9dtICz?MmF*j8q%rVH>!!bz1&|vavo-7|%CU$XMUB(vMlEkE( z)RN5P?9`It_{7YjoXot`Vst*UbC9cJh^s<~qmz%T0=n#EMSkhYOL@B)D<>QBg>W_M zF@oLEBs}>cU-V=zegmcalEj>Nn6|Y1(!3M}PrndXch?{;O^{a@1U3sgyyc%5z{`f0 F3jiv3OwIrR diff --git a/tour_site_scraper.rb b/tour_site_scraper.rb index aa02b24..20eaee8 100644 --- a/tour_site_scraper.rb +++ b/tour_site_scraper.rb @@ -1,6 +1,9 @@ require 'rubygems' require 'selenium-webdriver' require 'pry' +require 'sqlite3' + +DB = SQLite3::Database.new( "db_tour_scraper.db" ) MAX_RETRY = 40 # Maximum retry until the serarch page load in seconds MAX_CALL = 3 # Maximum recall air ticket site if any ajax error or busy page shown @@ -8,6 +11,8 @@ # Put Ticket Search input dates here TICKET_SEARCH_FROM_DATE = Date.new(2021, 12, 31) TICKET_SEARCH_TO_DATE = Date.new(2022, 01, 31) +TIME_FROM_OUT = '0600' +TIME_TO_OUT = '0700' WAIT = Selenium::WebDriver::Wait.new(timeout: 20) # Maximum wait to find out search results html WEB_DRIVER = Selenium::WebDriver.for :firefox @@ -22,7 +27,7 @@ def start_scraping(departure_date_in, departure_date_out) # Generate the search url physically using any date, time and put here, we will make it dynamic later based on requirement WEB_DRIVER.navigate.to "https://www.tour.ne.jp/j_air/list/?adult=1&arr_in=TYO&arr_out=CTS&change_date_in=0&change_date_out=0&" + "date_in=#{departure_date_in}&date_out=#{departure_date_out}&dpt_in=" + - "CTS&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0" + "CTS&dpt_out=TYO&time_from_out=#{TIME_FROM_OUT}&time_to_out=#{TIME_TO_OUT}&time_type_out=0" sleep(1) begin retries ||= 0 @@ -80,6 +85,36 @@ def searching_ticket_type(ticket_details_type) return all_tickets_details_lists, total_ticket_found end +def save_scrap_data(tickets_out_lists, tickets_in_lists, departure_date, return_date) + all_ticket_out_lists = tickets_out_lists[0] + all_ticket_in_lists = tickets_in_lists[0] + total_ticket_out_found = tickets_out_lists[1] + total_ticket_in_found = tickets_in_lists[1] + puts "Total tickets found for out is = " + total_ticket_out_found.to_s + puts "Total tickets found for in is = " + total_ticket_in_found.to_s + + DB.execute("INSERT INTO tickets_summary values(?, ?, ?, ?, ?, ?, ?, ? )", [nil, departure_date.to_s, return_date.to_s, TIME_FROM_OUT, TIME_TO_OUT, Time.now.strftime("%Y-%m-%d %H:%M:%S"), total_ticket_out_found, total_ticket_in_found]) + ticket_summary_id = DB.last_insert_row_id() + all_ticket_out_lists.each do |tickets_out| + DB.execute("INSERT INTO tickets_airline_companies values(?, ?, ?, ?, ?, ?)", [nil, DB.last_insert_row_id(), tickets_out[:ticket_company_name], tickets_out[:ticket_minimum_price], tickets_out[:number_of_ticket_found], 'round_trip']) + tickets_out[:ticket_flight_lists].each do |flight| + DB.execute("INSERT INTO airline_flights values(?, ?, ?, ?, ?, ?)", [nil, DB.last_insert_row_id(), flight['flight_code'], flight['flight_price'], flight['flight_changable_status'], flight['flight_type']]) + end + end + + all_ticket_in_lists.each do |tickets_in| + DB.execute("INSERT INTO tickets_airline_companies values(?, ?, ?, ?, ?, ?)", [nil, ticket_summary_id, tickets_in[:ticket_company_name], tickets_in[:ticket_minimum_price], tickets_in[:number_of_ticket_found], 'round_trip']) + tickets_in[:ticket_flight_lists].each do |flight| + DB.execute("INSERT INTO airline_flights values(?, ?, ?, ?, ?, ?)", [nil, DB.last_insert_row_id(), flight['flight_code'], flight['flight_price'], flight['flight_changable_status'], flight['flight_type']]) + end + end + + rows = DB.execute( "select * from tickets_summary" ) + + # binding.pry + +end + TICKET_SEARCH_FROM_DATE.upto(TICKET_SEARCH_TO_DATE) do |dt| departure_date_in = dt.to_s.delete("-") departure_date_out = dt.to_s.delete("-") @@ -100,11 +135,6 @@ def searching_ticket_type(ticket_details_type) raise "Could not get ticket website information: Please give necessary information to search" end - all_ticket_out_lists = tickets_out_lists[0] - total_ticket_out_found = tickets_out_lists[1] - all_ticket_in_details = tickets_in_lists[0] - total_ticket_in_found = tickets_in_lists[1] - - puts "Total tickets found for out is = " + total_ticket_out_found.to_s - puts "Total tickets found for in is = " + total_ticket_in_found.to_s + # Save scraped ticket details, initially departure date and return date is same + save_scrap_data(tickets_out_lists, tickets_in_lists, dt, dt) end \ No newline at end of file From 442d72d48eeb4f6dba2491231e022ee274a77144 Mon Sep 17 00:00:00 2001 From: Rasel Date: Wed, 15 Dec 2021 16:12:55 +0600 Subject: [PATCH 10/17] Migration file added, ticket company table insert query updated --- migration.rb | 41 +++++++++++++++++++++++++++++++++++++++++ tour_site_scraper.rb | 4 ++-- 2 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 migration.rb diff --git a/migration.rb b/migration.rb new file mode 100644 index 0000000..a2497e0 --- /dev/null +++ b/migration.rb @@ -0,0 +1,41 @@ +require 'sqlite3' + +db = SQLite3::Database.open "db_tour_scraper.db" + +db.execute < Date: Wed, 15 Dec 2021 16:35:52 +0600 Subject: [PATCH 11/17] ticket flight company_id, comapny table ticket_summary_id wrong issue fixed --- tour_site_scraper.rb | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tour_site_scraper.rb b/tour_site_scraper.rb index 3c020bf..f6ceb49 100644 --- a/tour_site_scraper.rb +++ b/tour_site_scraper.rb @@ -96,16 +96,18 @@ def save_scrap_data(tickets_out_lists, tickets_in_lists, departure_date, return_ DB.execute("INSERT INTO tickets_summary values(?, ?, ?, ?, ?, ?, ?, ? )", [nil, departure_date.to_s, return_date.to_s, TIME_FROM_OUT, TIME_TO_OUT, Time.now.strftime("%Y-%m-%d %H:%M:%S"), total_ticket_out_found, total_ticket_in_found]) ticket_summary_id = DB.last_insert_row_id() all_ticket_out_lists.each do |tickets_out| - DB.execute("INSERT INTO tickets_airline_companies values(?, ?, ?, ?, ?, ?)", [nil, DB.last_insert_row_id(), tickets_out[:ticket_company_name], tickets_out[:ticket_minimum_price], tickets_out[:number_of_ticket_found], 'out']) + DB.execute("INSERT INTO tickets_airline_companies values(?, ?, ?, ?, ?, ?)", [nil, ticket_summary_id, tickets_out[:ticket_company_name], tickets_out[:ticket_minimum_price], tickets_out[:number_of_ticket_found], 'out']) + ticket_out_company_id = DB.last_insert_row_id() tickets_out[:ticket_flight_lists].each do |flight| - DB.execute("INSERT INTO airline_flights values(?, ?, ?, ?, ?, ?)", [nil, DB.last_insert_row_id(), flight['flight_code'], flight['flight_price'], flight['flight_changable_status'], flight['flight_type']]) + DB.execute("INSERT INTO airline_flights values(?, ?, ?, ?, ?, ?)", [nil, ticket_out_company_id, flight['flight_code'], flight['flight_price'], flight['flight_changable_status'], flight['flight_type']]) end end all_ticket_in_lists.each do |tickets_in| DB.execute("INSERT INTO tickets_airline_companies values(?, ?, ?, ?, ?, ?)", [nil, ticket_summary_id, tickets_in[:ticket_company_name], tickets_in[:ticket_minimum_price], tickets_in[:number_of_ticket_found], 'in']) + ticket_in_company_id = DB.last_insert_row_id() tickets_in[:ticket_flight_lists].each do |flight| - DB.execute("INSERT INTO airline_flights values(?, ?, ?, ?, ?, ?)", [nil, DB.last_insert_row_id(), flight['flight_code'], flight['flight_price'], flight['flight_changable_status'], flight['flight_type']]) + DB.execute("INSERT INTO airline_flights values(?, ?, ?, ?, ?, ?)", [nil, ticket_in_company_id, flight['flight_code'], flight['flight_price'], flight['flight_changable_status'], flight['flight_type']]) end end From 05e2a20f5d33edfead25fe9aa9aa89f95dacdad0 Mon Sep 17 00:00:00 2001 From: Rasel Date: Wed, 15 Dec 2021 17:42:02 +0600 Subject: [PATCH 12/17] Added some clean code and comments --- tour_site_scraper.rb | 65 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 56 insertions(+), 9 deletions(-) diff --git a/tour_site_scraper.rb b/tour_site_scraper.rb index f6ceb49..089dd89 100644 --- a/tour_site_scraper.rb +++ b/tour_site_scraper.rb @@ -85,6 +85,8 @@ def searching_ticket_type(ticket_details_type) return all_tickets_details_lists, total_ticket_found end + +# Save tickets scraped data to database SQLite into different tables def save_scrap_data(tickets_out_lists, tickets_in_lists, departure_date, return_date) all_ticket_out_lists = tickets_out_lists[0] all_ticket_in_lists = tickets_in_lists[0] @@ -93,28 +95,73 @@ def save_scrap_data(tickets_out_lists, tickets_in_lists, departure_date, return_ puts "Total tickets found for out is = " + total_ticket_out_found.to_s puts "Total tickets found for in is = " + total_ticket_in_found.to_s - DB.execute("INSERT INTO tickets_summary values(?, ?, ?, ?, ?, ?, ?, ? )", [nil, departure_date.to_s, return_date.to_s, TIME_FROM_OUT, TIME_TO_OUT, Time.now.strftime("%Y-%m-%d %H:%M:%S"), total_ticket_out_found, total_ticket_in_found]) + # Save ticket summary + ticket_summary_data = [ + nil, + departure_date.to_s, + return_date.to_s, + TIME_FROM_OUT, + TIME_TO_OUT, + Time.now.strftime("%Y-%m-%d %H:%M:%S"), + total_ticket_out_found, total_ticket_in_found + ] + DB.execute("INSERT INTO tickets_summary values(?, ?, ?, ?, ?, ?, ?, ? )", ) ticket_summary_id = DB.last_insert_row_id() + + # Save all available out/departure tickets comapny and comapnies flights data all_ticket_out_lists.each do |tickets_out| - DB.execute("INSERT INTO tickets_airline_companies values(?, ?, ?, ?, ?, ?)", [nil, ticket_summary_id, tickets_out[:ticket_company_name], tickets_out[:ticket_minimum_price], tickets_out[:number_of_ticket_found], 'out']) + # Save company tickets informations + company_data = [ + nil, + ticket_summary_id, + tickets_out[:ticket_company_name], + tickets_out[:ticket_minimum_price], + tickets_out[:number_of_ticket_found], + 'out' + ] + DB.execute("INSERT INTO tickets_airline_companies values(?, ?, ?, ?, ?, ?)", ) + + # Save ticket flights information ticket_out_company_id = DB.last_insert_row_id() tickets_out[:ticket_flight_lists].each do |flight| - DB.execute("INSERT INTO airline_flights values(?, ?, ?, ?, ?, ?)", [nil, ticket_out_company_id, flight['flight_code'], flight['flight_price'], flight['flight_changable_status'], flight['flight_type']]) + flight_data = [ + nil, + ticket_out_company_id, + flight['flight_code'], + flight['flight_price'], + flight['flight_changable_status'], + flight['flight_type'] + ] + DB.execute("INSERT INTO airline_flights values(?, ?, ?, ?, ?, ?)", flight_data) end end + # Save all available in/return tickets comapny and comapnies flights data all_ticket_in_lists.each do |tickets_in| - DB.execute("INSERT INTO tickets_airline_companies values(?, ?, ?, ?, ?, ?)", [nil, ticket_summary_id, tickets_in[:ticket_company_name], tickets_in[:ticket_minimum_price], tickets_in[:number_of_ticket_found], 'in']) + # Save company tickets informations + ticket_in_company_data = [ + nil, + ticket_summary_id, + tickets_in[:ticket_company_name], + tickets_in[:ticket_minimum_price], + tickets_in[:number_of_ticket_found], + 'in' + ] + DB.execute("INSERT INTO tickets_airline_companies values(?, ?, ?, ?, ?, ?)", ticket_in_company_data) ticket_in_company_id = DB.last_insert_row_id() tickets_in[:ticket_flight_lists].each do |flight| - DB.execute("INSERT INTO airline_flights values(?, ?, ?, ?, ?, ?)", [nil, ticket_in_company_id, flight['flight_code'], flight['flight_price'], flight['flight_changable_status'], flight['flight_type']]) + flight_data = [ + nil, + ticket_in_company_id, + flight['flight_code'], + flight['flight_price'], + flight['flight_changable_status'], + flight['flight_type'] + ] + DB.execute("INSERT INTO airline_flights values(?, ?, ?, ?, ?, ?)", flight_data) end end - rows = DB.execute( "select * from tickets_summary" ) - - # binding.pry - end TICKET_SEARCH_FROM_DATE.upto(TICKET_SEARCH_TO_DATE) do |dt| From f8fbf27704c6d738d5efbc8cbb1cca545fa316d5 Mon Sep 17 00:00:00 2001 From: Rasel Date: Wed, 15 Dec 2021 18:17:49 +0600 Subject: [PATCH 13/17] refactoring added, fixed some missing variable issues, request changes updated --- migration.rb | 3 +-- tour_site_scraper.rb | 9 +++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/migration.rb b/migration.rb index a2497e0..cbf5a21 100644 --- a/migration.rb +++ b/migration.rb @@ -37,5 +37,4 @@ flight_ticket_type VARCHAR(100), FOREIGN KEY(ticket_airline_id) REFERENCES tickets_airlines(id) ); -SQL - +SQL \ No newline at end of file diff --git a/tour_site_scraper.rb b/tour_site_scraper.rb index 089dd89..2861dc5 100644 --- a/tour_site_scraper.rb +++ b/tour_site_scraper.rb @@ -3,7 +3,7 @@ require 'pry' require 'sqlite3' -DB = SQLite3::Database.new( "db_tour_scraper.db" ) +DB = SQLite3::Database.new("db_tour_scraper.db") MAX_RETRY = 40 # Maximum retry until the serarch page load in seconds MAX_CALL = 3 # Maximum recall air ticket site if any ajax error or busy page shown @@ -76,7 +76,8 @@ def searching_ticket_type(ticket_details_type) flight_data['flight_price'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-price > label > b')[0].attribute("innerHTML") flight_data['flight_seat'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-seat')[0].attribute("innerHTML") flight_data['flight_changable_status'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-date')[0].attribute("innerHTML") - flight_data['flight_type'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-type-text .ticket-detail-type-text-ellipsis')[0].attribute("innerHTML") + flight_data['flight_type'] = ticket_flight.find_elements(:css, + '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-type-text .ticket-detail-type-text-ellipsis')[0].attribute("innerHTML") ticket_flight_lists.push(flight_data) end temp_ticket_airline_info[:ticket_flight_lists] = ticket_flight_lists @@ -105,7 +106,7 @@ def save_scrap_data(tickets_out_lists, tickets_in_lists, departure_date, return_ Time.now.strftime("%Y-%m-%d %H:%M:%S"), total_ticket_out_found, total_ticket_in_found ] - DB.execute("INSERT INTO tickets_summary values(?, ?, ?, ?, ?, ?, ?, ? )", ) + DB.execute("INSERT INTO tickets_summary values(?, ?, ?, ?, ?, ?, ?, ? )", ticket_summary_data) ticket_summary_id = DB.last_insert_row_id() # Save all available out/departure tickets comapny and comapnies flights data @@ -119,7 +120,7 @@ def save_scrap_data(tickets_out_lists, tickets_in_lists, departure_date, return_ tickets_out[:number_of_ticket_found], 'out' ] - DB.execute("INSERT INTO tickets_airline_companies values(?, ?, ?, ?, ?, ?)", ) + DB.execute("INSERT INTO tickets_airline_companies values(?, ?, ?, ?, ?, ?)", company_data) # Save ticket flights information ticket_out_company_id = DB.last_insert_row_id() From 65129c69772e4fedf789f4b4dbd939b70af14a28 Mon Sep 17 00:00:00 2001 From: Rasel Date: Wed, 15 Dec 2021 18:45:08 +0600 Subject: [PATCH 14/17] Migration file some comments lin added --- migration.rb | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/migration.rb b/migration.rb index cbf5a21..cc090b3 100644 --- a/migration.rb +++ b/migration.rb @@ -2,6 +2,8 @@ db = SQLite3::Database.open "db_tour_scraper.db" +puts "Migration started ...." + db.execute < Date: Wed, 15 Dec 2021 18:47:21 +0600 Subject: [PATCH 15/17] Migration file some comments line added --- migration.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/migration.rb b/migration.rb index cc090b3..02931d0 100644 --- a/migration.rb +++ b/migration.rb @@ -42,5 +42,6 @@ FOREIGN KEY(ticket_airline_id) REFERENCES tickets_airlines(id) ); SQL + puts "Table airline_flights created" -puts "Migration ended." \ No newline at end of file +puts "Migration ended." From 6d04c9c08dcf6473e563f0ca8175f1f6c8a9b14a Mon Sep 17 00:00:00 2001 From: Rasel Date: Wed, 15 Dec 2021 18:48:46 +0600 Subject: [PATCH 16/17] Migration file some empty line added --- migration.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/migration.rb b/migration.rb index 02931d0..76f5070 100644 --- a/migration.rb +++ b/migration.rb @@ -44,4 +44,5 @@ SQL puts "Table airline_flights created" + puts "Migration ended." From 46e8053a8c63885a010bf7024593401d11d5af70 Mon Sep 17 00:00:00 2001 From: Rasel Date: Wed, 15 Dec 2021 19:30:22 +0600 Subject: [PATCH 17/17] db column spelling and realted code updated --- migration.rb | 2 +- tour_site_scraper.rb | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/migration.rb b/migration.rb index 76f5070..03abf6c 100644 --- a/migration.rb +++ b/migration.rb @@ -37,7 +37,7 @@ ticket_airline_id INTEGER, flight_code VARCHAR(50), flight_price INTEGER, - flight_changable_status VARCHAR(50), + flight_changeable_status VARCHAR(50), flight_ticket_type VARCHAR(100), FOREIGN KEY(ticket_airline_id) REFERENCES tickets_airlines(id) ); diff --git a/tour_site_scraper.rb b/tour_site_scraper.rb index 2861dc5..facb98a 100644 --- a/tour_site_scraper.rb +++ b/tour_site_scraper.rb @@ -75,7 +75,7 @@ def searching_ticket_type(ticket_details_type) flight_data['flight_code'] = ticket_flight.find_elements(:css, '.ticket-summary-row > span')[1].attribute("innerHTML") flight_data['flight_price'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-price > label > b')[0].attribute("innerHTML") flight_data['flight_seat'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-seat')[0].attribute("innerHTML") - flight_data['flight_changable_status'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-date')[0].attribute("innerHTML") + flight_data['flight_changeable_status'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-date')[0].attribute("innerHTML") flight_data['flight_type'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-type-text .ticket-detail-type-text-ellipsis')[0].attribute("innerHTML") ticket_flight_lists.push(flight_data) @@ -86,7 +86,6 @@ def searching_ticket_type(ticket_details_type) return all_tickets_details_lists, total_ticket_found end - # Save tickets scraped data to database SQLite into different tables def save_scrap_data(tickets_out_lists, tickets_in_lists, departure_date, return_date) all_ticket_out_lists = tickets_out_lists[0] @@ -130,7 +129,7 @@ def save_scrap_data(tickets_out_lists, tickets_in_lists, departure_date, return_ ticket_out_company_id, flight['flight_code'], flight['flight_price'], - flight['flight_changable_status'], + flight['flight_changeable_status'], flight['flight_type'] ] DB.execute("INSERT INTO airline_flights values(?, ?, ?, ?, ?, ?)", flight_data) @@ -156,7 +155,7 @@ def save_scrap_data(tickets_out_lists, tickets_in_lists, departure_date, return_ ticket_in_company_id, flight['flight_code'], flight['flight_price'], - flight['flight_changable_status'], + flight['flight_changeable_status'], flight['flight_type'] ] DB.execute("INSERT INTO airline_flights values(?, ?, ?, ?, ?, ?)", flight_data)