Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scraping ticket details and all flight information between two dates; related code updated #6

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
127 changes: 92 additions & 35 deletions tour_site_scraper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,52 +2,109 @@
require 'selenium-webdriver'
require 'pry'

# These options are for headless execution of the browser, so that no browser window needs to be loaded.
# NOTE(review): none of the constants below enables headless mode; this comment seems to belong to the
# commented-out Firefox options elsewhere in the file — confirm.
MAX_RETRY = 40 # Maximum number of retries while waiting for the search page to load (the retry loops sleep 1 s per attempt)
MAX_CALL = 3 # Maximum number of times the air ticket site is called again after an AJAX error or a busy page

# Ticket search date range: put the first and last search dates here.
# NOTE(review): Date is used here but only 'selenium-webdriver' and 'pry' are visibly required — confirm 'date' is loaded.
TICKET_SEARCH_FROM_DATE = Date.new(2021, 12, 31)
# `01` is an octal integer literal; it still equals 1, but `08` or `09` would be a syntax error.
TICKET_SEARCH_TO_DATE = Date.new(2022, 01, 31)

WAIT = Selenium::WebDriver::Wait.new(timeout: 20) # Explicit wait (20 s timeout) used while looking for the search result HTML
WEB_DRIVER = Selenium::WebDriver.for :firefox # Firefox session, created as soon as this file is loaded

# Headless alternative (kept commented out): start Firefox with the '-headless' argument.
# options = Selenium::WebDriver::Firefox::Options.new(args: ['-headless'])
# driver = Selenium::WebDriver.for(:firefox, options: options)

# NOTE(review): this section works on a local `driver` and a fixed one-way URL, while other parts of the
# file use WEB_DRIVER. It looks like an earlier version of the script shown interleaved with the newer
# one — confirm which lines are live before relying on it.
# Open a Firefox session and announce that fetching has started.
driver = Selenium::WebDriver.for :firefox
puts 'Trying to fetch data from site.....'
puts '--------------------------------------------------------'

# The search URL is hard-coded here for one date and time window (date_out=20211231, 06:00-07:00, TYO -> ITM);
# it is meant to be made dynamic later, based on requirements.
driver.navigate.to 'https://www.tour.ne.jp/j_air/list/?adult=1&air_type=2&arr_out=ITM&change_date_in=0&change_date_out=0&date_out=20211231&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0'
sleep(1) # Wait 1s to load the page properly

# NOTE(review): MAX_RETRY is also assigned (to 40) earlier in this file; assigning a constant twice makes
# Ruby print an "already initialized constant" warning — confirm which value is intended.
MAX_RETRY = 100
# Try to find and click the outbound airline summary button, retrying once per second until
# MAX_RETRY is exceeded, then give up with an error.
begin
# `||=` keeps the counter across `retry`, which re-runs this begin block from the top.
retries ||= 0
ticket_summary_button = driver.find_element(:css, '#Act_Airline_Out')
# Stop here if no button object came back.
return if ticket_summary_button.nil?
ticket_summary_button.click
# Rescuing Exception catches every error class (including interrupts); `e` is not used below.
rescue Exception => e
puts 'Trying to fetch data.. ' + retries.to_s
retries += 1
sleep(1) # Wait 1s to load the page properly
retry if (retries <= MAX_RETRY)
# Reached only once the retry budget is used up.
raise "Could not get ticket website information: Please give necessary information to search"
end

# Give the page some time after the click to load the AJAX content, until the search elements can be found.
# NOTE(review): the body and the closing `end` of this loop cannot be clearly identified in the lines
# that follow — verify against the real file.
loop do
# Opens the round-trip search page for the given dates in WEB_DRIVER and clicks the outbound and
# inbound airline summary buttons (#Act_Airline_Out and #Act_Airline_In).
#
# The URL searches TYO -> CTS for the outbound leg and CTS -> TYO for the inbound leg, with the
# outbound time window fixed to 06:00-07:00.
#
# departure_date_in  - value interpolated into the `date_in` query parameter
# departure_date_out - value interpolated into the `date_out` query parameter
# (the visible caller passes both as "YYYYMMDD" strings)
#
# Raises a RuntimeError with a fixed message once more than MAX_RETRY attempts have failed.
def start_scraping(departure_date_in, departure_date_out)
# The search URL is built from the two dates; the route and the time window are still hard-coded.
WEB_DRIVER.navigate.to "https://www.tour.ne.jp/j_air/list/?adult=1&arr_in=TYO&arr_out=CTS&change_date_in=0&change_date_out=0&" +
"date_in=#{departure_date_in}&date_out=#{departure_date_out}&dpt_in=" +
"CTS&dpt_out=TYO&time_from_out=0600&time_to_out=0700&time_type_out=0"
sleep(1)
# NOTE(review): the next two lines use `driver`, which is not a local of this method (a `def` does not
# see top-level locals), and `break` has no enclosing loop here. They look like leftovers of an earlier
# top-level loop — confirm whether they belong in this method at all.
if !driver.find_elements(:class, 'airline-name').nil?
break
begin
# `||=` keeps the counter across `retry`, which re-runs this begin block from the top.
retries ||= 0
ticket_summary_button_out = nil
ticket_summary_button_out = WEB_DRIVER.find_element(:css, '#Act_Airline_Out')
ticket_summary_button_in = WEB_DRIVER.find_element(:css, '#Act_Airline_In')
# Leave early only when neither button object came back.
# NOTE(review): find_element is expected to raise rather than return nil when nothing matches, so
# this guard may never trigger — confirm against the Selenium documentation.
return if ticket_summary_button_out.nil? && ticket_summary_button_in.nil?
ticket_summary_button_out.click
ticket_summary_button_in.click
# Rescuing Exception catches every error class (including interrupts); `e` is not used below.
rescue Exception => e
puts 'Trying to fetch data.. ' + retries.to_s
retries += 1
sleep(1) # Wait 1s to load the page properly
retry if (retries <= MAX_RETRY)
# Reached only once the retry budget is used up.
raise "Could not get ticket website information: Please give necessary information to search"
end
end

# Collect the airline name elements (class 'airline-name') and the per-company ticket count
# elements (class 'toggle-btn-company') from the page.
# NOTE(review): these lines use the local `driver` rather than WEB_DRIVER — presumably part of the
# earlier top-level script; confirm they are still live.
ticket_summary = driver.find_elements(:class, 'airline-name')
ticket_available_lists = driver.find_elements(:class, 'toggle-btn-company')
# Reads every airline box of one search result list from the page loaded in WEB_DRIVER.
#
# ticket_details_type - 'in' reads the boxes under #Act_response_in; any other value reads the
#                       boxes under #Act_response_out.
#
# Returns two values (received as a two-element array by the visible callers):
#   1. an array with one hash per airline box, with the symbol keys :ticket_company_name,
#      :ticket_minimum_price, :number_of_ticket_found and :ticket_flight_lists
#      (an array of per-flight hashes with the string keys 'flight_code', 'flight_price',
#      'flight_seat', 'flight_changable_status' and 'flight_type');
#   2. the sum of the ticket counts over all airline boxes.
def searching_ticket_type(ticket_details_type)
if ticket_details_type == 'in'
ticket_airlines = WEB_DRIVER.find_elements(:css, '#Act_response_in .company-list .company-box')
else
ticket_airlines = WEB_DRIVER.find_elements(:css, '#Act_response_out .company-list .company-box')
end

total_ticket_found = 0
all_tickets_details_lists = []
# `&.` skips the loop if the lookup returned nil.
ticket_airlines&.each do |ticket_airline|
temp_ticket_airline_info = {}
number_of_ticket_found = 0

ticket_company_name = ticket_airline.find_element(:css, '.airline-name').text
# delete('^0-9') removes every non-digit character, leaving only the count from the button text.
number_of_ticket_found = ticket_airline.find_element(:css, '.toggle-btn-company').text.delete('^0-9').to_i
total_ticket_found += number_of_ticket_found
ticket_minimum_price = ticket_airline.find_element(:css, '.hdg-sup-price > b').text

# Parse the elements to find each company's available tickets and sum them to get the total available tickets.
# NOTE(review): the next three code lines read `ticket_available_lists`, which is not defined in this
# method, and open a `do` block with no matching `end` of its own. They look like lines from the
# earlier top-level script — confirm whether they belong here.
total_available_ticket = 0
!ticket_available_lists.nil? && ticket_available_lists.each do |ticket_count|
total_available_ticket += ticket_count.text.delete('^0-9').to_i
temp_ticket_airline_info[:ticket_company_name] = ticket_company_name
temp_ticket_airline_info[:ticket_minimum_price] = ticket_minimum_price
temp_ticket_airline_info[:number_of_ticket_found] = number_of_ticket_found

# Collect the details of every flight listed inside this airline box.
ticket_flight_lists = []
ticket_airline_flights_lists = ticket_airline.find_elements(:css, '.Act_flight_list')
ticket_airline_flights_lists&.each do |ticket_flight|
flight_data = {}
# Each field takes one match of its selector ([1] = second match, [0] = first match) and reads its
# innerHTML. If a selector matches too few elements the index yields nil and the call raises NoMethodError.
flight_data['flight_code'] = ticket_flight.find_elements(:css, '.ticket-summary-row > span')[1].attribute("innerHTML")
flight_data['flight_price'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-price > label > b')[0].attribute("innerHTML")
flight_data['flight_seat'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-seat')[0].attribute("innerHTML")
flight_data['flight_changable_status'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-date')[0].attribute("innerHTML")
flight_data['flight_type'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-type-text .ticket-detail-type-text-ellipsis')[0].attribute("innerHTML")
ticket_flight_lists.push(flight_data)
end
temp_ticket_airline_info[:ticket_flight_lists] = ticket_flight_lists
all_tickets_details_lists.push(temp_ticket_airline_info)
end
# Both values are returned together; callers index the result with [0] and [1].
return all_tickets_details_lists, total_ticket_found
end

# NOTE(review): the next three code lines print the company names from `ticket_summary` and open a
# `do` block with no matching `end` of its own; they look like part of the earlier single-date
# script — confirm whether they are still live.
puts 'Available ticket companies name = '
!ticket_summary.nil? && ticket_summary.each do |ticket_cmpany|
puts ticket_cmpany.text.to_s + ', '
# Run one round-trip search for every date from TICKET_SEARCH_FROM_DATE to TICKET_SEARCH_TO_DATE (both inclusive).
TICKET_SEARCH_FROM_DATE.upto(TICKET_SEARCH_TO_DATE) do |dt|
# Date#to_s gives "YYYY-MM-DD"; removing the dashes yields "YYYYMMDD".
# The outbound and inbound legs are both searched for the same day.
departure_date_in = dt.to_s.delete("-")
departure_date_out = dt.to_s.delete("-")

puts "\n\nTickets for this date " + dt.to_s

# Load the page and read both result lists; on any error the whole sequence is repeated
# until MAX_CALL is exceeded.
begin
# `||=` keeps the counter across `retry`, which re-runs this begin block from the top.
retries ||= 0
start_scraping(departure_date_in, departure_date_out)
# Wait (up to the WAIT timeout) until both the outbound and the return ticket lists are displayed.
WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_out .company-list").displayed? }
WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_in .company-list").displayed? }
tickets_out_lists = searching_ticket_type('out')
tickets_in_lists = searching_ticket_type('in')
# Rescuing Exception catches every error class (including interrupts); the original error is discarded.
rescue Exception
retries += 1
retry if (retries <= MAX_CALL)
# Reached only once the retry budget is used up.
raise "Could not get ticket website information: Please give necessary information to search"
end

# searching_ticket_type returns [details list, total count]; unpack both directions.
all_ticket_out_lists = tickets_out_lists[0]
total_ticket_out_found = tickets_out_lists[1]
all_ticket_in_details = tickets_in_lists[0]
total_ticket_in_found = tickets_in_lists[1]

puts "Total tickets found for out is = " + total_ticket_out_found.to_s
puts "Total tickets found for in is = " + total_ticket_in_found.to_s
end
# NOTE(review): `total_available_ticket` is not assigned anywhere in the per-date loop above; this
# line looks like it belongs to the earlier single-date script — confirm it is still reachable and valid.
puts 'Total available ticket found is = ' + total_available_ticket.to_s