Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scraping tickets details and all flights information for between two dates, related code updated #6

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
136 changes: 77 additions & 59 deletions tour_site_scraper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,91 +2,109 @@
require 'selenium-webdriver'
require 'pry'

MAX_RETRY = 100
WAIT = Selenium::WebDriver::Wait.new(timeout: 20)
MAX_RETRY = 40 # Maximum retry until the serarch page load in seconds
MAX_CALL = 3 # Maximum recall air ticket site if any ajax error or busy page shown

# Put Ticket Search input dates here
TICKET_SEARCH_FROM_DATE = Date.new(2021, 12, 31)
TICKET_SEARCH_TO_DATE = Date.new(2022, 01, 31)

WAIT = Selenium::WebDriver::Wait.new(timeout: 20) # Maximum wait to find out search results html
WEB_DRIVER = Selenium::WebDriver.for :firefox

# options = Selenium::WebDriver::Firefox::Options.new(args: ['-headless'])
# driver = Selenium::WebDriver.for(:firefox, options: options)
driver = Selenium::WebDriver.for :firefox

puts 'Trying to fetch data from site.....'
puts '--------------------------------------------------------'

def check_return_tickets_visibility(driver)
begin
# Wait for few seconds until able to find return tickets list
WAIT.until { driver.find_element(css: "#Act_response_in .toggle-btn-company").displayed? }
WAIT.until { driver.find_element(css: "#Act_response_in .airline-name").displayed? }
rescue Exception
end
end

ticket_search_date_from = Date.new(2021, 12, 31)
ticket_search_date_to = Date.new(2022, 01, 31)
ticket_search_date_from.upto(ticket_search_date_to) do |dt|
departure_date_in = dt.to_s.delete("-")
departure_date_out = dt.to_s.delete("-")

puts "\n\nTickets for this date " + dt.to_s

def start_scraping(departure_date_in, departure_date_out)
# Generate the search url physically using any date, time and put here, we will make it dynamic later based on requirement
driver.navigate.to 'https://www.tour.ne.jp/j_air/list/?adult=1&arr_in=TYO&arr_out=CTS&change_date_in=0&change_date_out=0&date_in=' + departure_date_in + '&date_out=' + departure_date_out + '&dpt_in=CTS&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0'
sleep(1) # Wait 1s to load the page properly
WEB_DRIVER.navigate.to "https://www.tour.ne.jp/j_air/list/?adult=1&arr_in=TYO&arr_out=CTS&change_date_in=0&change_date_out=0&" +
"date_in=#{departure_date_in}&date_out=#{departure_date_out}&dpt_in=" +
"CTS&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0"
sleep(1)
begin
retries ||= 0
ticket_summary_button_out = nil
ticket_summary_button_out = driver.find_element(:css, '#Act_Airline_Out')
ticket_summary_button_in = driver.find_element(:css, '#Act_Airline_In')
ticket_summary_button_out = WEB_DRIVER.find_element(:css, '#Act_Airline_Out')
ticket_summary_button_in = WEB_DRIVER.find_element(:css, '#Act_Airline_In')
return if ticket_summary_button_out.nil? && ticket_summary_button_in.nil?
ticket_summary_button_out.click
ticket_summary_button_in.click

WAIT.until { driver.find_element(css: "#Act_response_out .toggle-btn-company").displayed? }
WAIT.until { driver.find_element(css: "#Act_response_out .airline-name").displayed? }
rescue Exception => e
puts 'Trying to fetch data.. ' + retries.to_s
retries += 1
sleep(1) # Wait 1s to load the page properly
retry if (retries <= MAX_RETRY)
raise "Could not get ticket website information: Please give necessary information to search"
end
end

check_return_tickets_visibility(driver)

# Scrap Available Tickets Elements
ticket_summary = driver.find_elements(:css, '#Act_response_out .airline-name')
ticket_available_lists = driver.find_elements(:css, '#Act_response_out .toggle-btn-company')

# Parse elements to find each companies available ticket and sum
total_available_ticket = 0
ticket_available_lists&.each do |ticket_count|
total_available_ticket += ticket_count.text.delete('^0-9').to_i
def searching_ticket_type(ticket_details_type)
if ticket_details_type == 'in'
ticket_airlines = WEB_DRIVER.find_elements(:css, '#Act_response_in .company-list .company-box')
else
ticket_airlines = WEB_DRIVER.find_elements(:css, '#Act_response_out .company-list .company-box')
end

# Scrap Returning Tickets Elements
ticket_summary_in = driver.find_elements(:css, '#Act_response_in .airline-name')
ticket_available_lists_in = driver.find_elements(:css, '#Act_response_in .toggle-btn-company')

# Parse elements to find each companies returning tickets and sum
total_available_ticket_in = 0
ticket_available_lists_in&.each do |ticket_count_in|
total_available_ticket_in += ticket_count_in.text.delete('^0-9').to_i
total_ticket_found = 0
all_tickets_details_lists = []
ticket_airlines&.each do |ticket_airline|
temp_ticket_airline_info = {}
number_of_ticket_found = 0

ticket_company_name = ticket_airline.find_element(:css, '.airline-name').text
number_of_ticket_found = ticket_airline.find_element(:css, '.toggle-btn-company').text.delete('^0-9').to_i
total_ticket_found += number_of_ticket_found
ticket_minimum_price = ticket_airline.find_element(:css, '.hdg-sup-price > b').text

temp_ticket_airline_info[:ticket_company_name] = ticket_company_name
temp_ticket_airline_info[:ticket_minimum_price] = ticket_minimum_price
temp_ticket_airline_info[:number_of_ticket_found] = number_of_ticket_found

ticket_flight_lists = []
ticket_airline_flights_lists = ticket_airline.find_elements(:css, '.Act_flight_list')
ticket_airline_flights_lists&.each do |ticket_flight|
flight_data = {}
flight_data['flight_code'] = ticket_flight.find_elements(:css, '.ticket-summary-row > span')[1].attribute("innerHTML")
flight_data['flight_price'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-price > label > b')[0].attribute("innerHTML")
flight_data['flight_seat'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-seat')[0].attribute("innerHTML")
flight_data['flight_changable_status'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-date')[0].attribute("innerHTML")
flight_data['flight_type'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-type-text .ticket-detail-type-text-ellipsis')[0].attribute("innerHTML")
ticket_flight_lists.push(flight_data)
end
temp_ticket_airline_info[:ticket_flight_lists] = ticket_flight_lists
all_tickets_details_lists.push(temp_ticket_airline_info)
end
return all_tickets_details_lists, total_ticket_found
end

# Write all tickets search results
puts 'Total available ticket OUT found is = ' + total_available_ticket.to_s
puts 'Total available ticket IN found is = ' + total_available_ticket_in.to_s
TICKET_SEARCH_FROM_DATE.upto(TICKET_SEARCH_TO_DATE) do |dt|
departure_date_in = dt.to_s.delete("-")
departure_date_out = dt.to_s.delete("-")

puts 'Available ticket IN companies name : '
puts '------------------------------------'
ticket_summary_in&.each do |ticket_cmpany_in|
puts ticket_cmpany_in.text.to_s + ', '
end
puts "\n\nTickets for this date " + dt.to_s

puts
puts 'Available ticket OUT companies name : '
puts '-------------------------------------'
ticket_summary&.each do |ticket_cmpany|
puts ticket_cmpany.text.to_s + ', '
begin
retries ||= 0
start_scraping(departure_date_in, departure_date_out)
# Wait for few seconds until able to find return tickets list
WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_out .company-list").displayed? }
WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_in .company-list").displayed? }
tickets_out_lists = searching_ticket_type('out')
tickets_in_lists = searching_ticket_type('in')
rescue Exception
retries += 1
retry if (retries <= MAX_CALL)
raise "Could not get ticket website information: Please give necessary information to search"
end
end

all_ticket_out_lists = tickets_out_lists[0]
total_ticket_out_found = tickets_out_lists[1]
all_ticket_in_details = tickets_in_lists[0]
total_ticket_in_found = tickets_in_lists[1]

puts "Total tickets found for out is = " + total_ticket_out_found.to_s
puts "Total tickets found for in is = " + total_ticket_in_found.to_s
end