diff --git a/tour_site_scraper.rb b/tour_site_scraper.rb index 0371077..aa02b24 100644 --- a/tour_site_scraper.rb +++ b/tour_site_scraper.rb @@ -2,47 +2,36 @@ require 'selenium-webdriver' require 'pry' -MAX_RETRY = 100 -WAIT = Selenium::WebDriver::Wait.new(timeout: 20) +MAX_RETRY = 40 # Maximum retry until the serarch page load in seconds +MAX_CALL = 3 # Maximum recall air ticket site if any ajax error or busy page shown + +# Put Ticket Search input dates here +TICKET_SEARCH_FROM_DATE = Date.new(2021, 12, 31) +TICKET_SEARCH_TO_DATE = Date.new(2022, 01, 31) + +WAIT = Selenium::WebDriver::Wait.new(timeout: 20) # Maximum wait to find out search results html +WEB_DRIVER = Selenium::WebDriver.for :firefox # options = Selenium::WebDriver::Firefox::Options.new(args: ['-headless']) # driver = Selenium::WebDriver.for(:firefox, options: options) -driver = Selenium::WebDriver.for :firefox puts 'Trying to fetch data from site.....' puts '--------------------------------------------------------' -def check_return_tickets_visibility(driver) - begin - # Wait for few seconds until able to find return tickets list - WAIT.until { driver.find_element(css: "#Act_response_in .toggle-btn-company").displayed? } - WAIT.until { driver.find_element(css: "#Act_response_in .airline-name").displayed? } - rescue Exception - end -end - -ticket_search_date_from = Date.new(2021, 12, 31) -ticket_search_date_to = Date.new(2022, 01, 31) -ticket_search_date_from.upto(ticket_search_date_to) do |dt| - departure_date_in = dt.to_s.delete("-") - departure_date_out = dt.to_s.delete("-") - - puts "\n\nTickets for this date " + dt.to_s - +def start_scraping(departure_date_in, departure_date_out) # Generate the search url physically using any date, time and put here, we will make it dynamic later based on requirement - driver.navigate.to 'https://www.tour.ne.jp/j_air/list/?adult=1&arr_in=TYO&arr_out=CTS&change_date_in=0&change_date_out=0&date_in=' + departure_date_in + '&date_out=' + departure_date_out + '&dpt_in=CTS&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0' - sleep(1) # Wait 1s to load the page properly + WEB_DRIVER.navigate.to "https://www.tour.ne.jp/j_air/list/?adult=1&arr_in=TYO&arr_out=CTS&change_date_in=0&change_date_out=0&" + + "date_in=#{departure_date_in}&date_out=#{departure_date_out}&dpt_in=" + + "CTS&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0" + sleep(1) begin retries ||= 0 ticket_summary_button_out = nil - ticket_summary_button_out = driver.find_element(:css, '#Act_Airline_Out') - ticket_summary_button_in = driver.find_element(:css, '#Act_Airline_In') + ticket_summary_button_out = WEB_DRIVER.find_element(:css, '#Act_Airline_Out') + ticket_summary_button_in = WEB_DRIVER.find_element(:css, '#Act_Airline_In') return if ticket_summary_button_out.nil? && ticket_summary_button_in.nil? ticket_summary_button_out.click ticket_summary_button_in.click - - WAIT.until { driver.find_element(css: "#Act_response_out .toggle-btn-company").displayed? } - WAIT.until { driver.find_element(css: "#Act_response_out .airline-name").displayed? } rescue Exception => e puts 'Trying to fetch data.. ' + retries.to_s retries += 1 @@ -50,43 +39,72 @@ def check_return_tickets_visibility(driver) retry if (retries <= MAX_RETRY) raise "Could not get ticket website information: Please give necessary information to search" end +end - check_return_tickets_visibility(driver) - - # Scrap Available Tickets Elements - ticket_summary = driver.find_elements(:css, '#Act_response_out .airline-name') - ticket_available_lists = driver.find_elements(:css, '#Act_response_out .toggle-btn-company') - - # Parse elements to find each companies available ticket and sum - total_available_ticket = 0 - ticket_available_lists&.each do |ticket_count| - total_available_ticket += ticket_count.text.delete('^0-9').to_i +def searching_ticket_type(ticket_details_type) + if ticket_details_type == 'in' + ticket_airlines = WEB_DRIVER.find_elements(:css, '#Act_response_in .company-list .company-box') + else + ticket_airlines = WEB_DRIVER.find_elements(:css, '#Act_response_out .company-list .company-box') end - # Scrap Returning Tickets Elements - ticket_summary_in = driver.find_elements(:css, '#Act_response_in .airline-name') - ticket_available_lists_in = driver.find_elements(:css, '#Act_response_in .toggle-btn-company') - - # Parse elements to find each companies returning tickets and sum - total_available_ticket_in = 0 - ticket_available_lists_in&.each do |ticket_count_in| - total_available_ticket_in += ticket_count_in.text.delete('^0-9').to_i + total_ticket_found = 0 + all_tickets_details_lists = [] + ticket_airlines&.each do |ticket_airline| + temp_ticket_airline_info = {} + number_of_ticket_found = 0 + + ticket_company_name = ticket_airline.find_element(:css, '.airline-name').text + number_of_ticket_found = ticket_airline.find_element(:css, '.toggle-btn-company').text.delete('^0-9').to_i + total_ticket_found += number_of_ticket_found + ticket_minimum_price = ticket_airline.find_element(:css, '.hdg-sup-price > b').text + + temp_ticket_airline_info[:ticket_company_name] = ticket_company_name + temp_ticket_airline_info[:ticket_minimum_price] = ticket_minimum_price + temp_ticket_airline_info[:number_of_ticket_found] = number_of_ticket_found + + ticket_flight_lists = [] + ticket_airline_flights_lists = ticket_airline.find_elements(:css, '.Act_flight_list') + ticket_airline_flights_lists&.each do |ticket_flight| + flight_data = {} + flight_data['flight_code'] = ticket_flight.find_elements(:css, '.ticket-summary-row > span')[1].attribute("innerHTML") + flight_data['flight_price'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-price > label > b')[0].attribute("innerHTML") + flight_data['flight_seat'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-seat')[0].attribute("innerHTML") + flight_data['flight_changable_status'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-date')[0].attribute("innerHTML") + flight_data['flight_type'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-type-text .ticket-detail-type-text-ellipsis')[0].attribute("innerHTML") + ticket_flight_lists.push(flight_data) + end + temp_ticket_airline_info[:ticket_flight_lists] = ticket_flight_lists + all_tickets_details_lists.push(temp_ticket_airline_info) end + return all_tickets_details_lists, total_ticket_found +end - # Write all tickets search results - puts 'Total available ticket OUT found is = ' + total_available_ticket.to_s - puts 'Total available ticket IN found is = ' + total_available_ticket_in.to_s +TICKET_SEARCH_FROM_DATE.upto(TICKET_SEARCH_TO_DATE) do |dt| + departure_date_in = dt.to_s.delete("-") + departure_date_out = dt.to_s.delete("-") - puts 'Available ticket IN companies name : ' - puts '------------------------------------' - ticket_summary_in&.each do |ticket_cmpany_in| - puts ticket_cmpany_in.text.to_s + ', ' - end + puts "\n\nTickets for this date " + dt.to_s - puts - puts 'Available ticket OUT companies name : ' - puts '-------------------------------------' - ticket_summary&.each do |ticket_cmpany| - puts ticket_cmpany.text.to_s + ', ' + begin + retries ||= 0 + start_scraping(departure_date_in, departure_date_out) + # Wait for few seconds until able to find return tickets list + WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_out .company-list").displayed? } + WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_in .company-list").displayed? } + tickets_out_lists = searching_ticket_type('out') + tickets_in_lists = searching_ticket_type('in') + rescue Exception + retries += 1 + retry if (retries <= MAX_CALL) + raise "Could not get ticket website information: Please give necessary information to search" end -end + + all_ticket_out_lists = tickets_out_lists[0] + total_ticket_out_found = tickets_out_lists[1] + all_ticket_in_details = tickets_in_lists[0] + total_ticket_in_found = tickets_in_lists[1] + + puts "Total tickets found for out is = " + total_ticket_out_found.to_s + puts "Total tickets found for in is = " + total_ticket_in_found.to_s +end \ No newline at end of file