Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature implement database #7

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ gem 'watir', '~> 6.19', '>= 6.19.1'
gem 'webdrivers', '~> 4.6'
gem 'nokogiri', '~> 1.11', '>= 1.11.7'
gem 'geckodriver-helper', '~> 0.0.3'
gem 'sqlite3'
6 changes: 6 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ GEM
geckodriver-helper (0.0.5)
archive-zip (~> 0.7)
io-like (0.3.1)
mini_portile2 (2.6.1)
nokogiri (1.12.5)
mini_portile2 (~> 2.6.1)
racc (~> 1.4)
nokogiri (1.12.5-x86_64-linux)
racc (~> 1.4)
racc (1.6.0)
Expand All @@ -17,6 +21,7 @@ GEM
childprocess (>= 0.5, < 5.0)
rexml (~> 3.2, >= 3.2.5)
rubyzip (>= 1.2.2)
sqlite3 (1.4.2)
watir (6.19.1)
regexp_parser (>= 1.2, < 3)
selenium-webdriver (>= 3.142.7)
Expand All @@ -32,6 +37,7 @@ PLATFORMS
DEPENDENCIES
geckodriver-helper (~> 0.0.3)
nokogiri (~> 1.11, >= 1.11.7)
sqlite3
watir (~> 6.19, >= 6.19.1)
webdrivers (~> 4.6)

Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# rubysite scraping process

Run `bundle install`
Run `ruby migration.rb`
Run `ruby tour_site_scraper.rb`

Binary file added db_tour_scraper.db
Binary file not shown.
48 changes: 48 additions & 0 deletions migration.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
require 'sqlite3'

# Creates the SQLite tables used by the tour scraper in db_tour_scraper.db.
#
# Every statement is CREATE TABLE IF NOT EXISTS, so re-running the migration
# keeps existing tables and their rows. The flip side: a database file that was
# created with an older definition of a table keeps that definition; drop the
# table (or delete the file) and re-run to pick up a schema change.
db = SQLite3::Database.open "db_tour_scraper.db"

begin
  puts "Migration started ...."

  # One row per executed search.
  db.execute <<-SQL
    CREATE TABLE IF NOT EXISTS tickets_summary (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      departure_date Date,
      return_date Date,
      time_from_out VARCHAR(20),
      time_to_out VARCHAR(20),
      search_time DateTime,
      total_tickets_out INTEGER,
      total_tickets_in INTEGER
    );
  SQL
  puts "Table tickets_summary created"

  # One row per airline found in a search; ticket_type holds the direction.
  db.execute <<-SQL
    CREATE TABLE IF NOT EXISTS tickets_airline_companies (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      ticket_summary_id INTEGER,
      airline_company_name VARCHAR(100),
      ticket_lowest_price FLOAT,
      total_flights_available INTEGER,
      ticket_type VARCHAR(10),
      FOREIGN KEY(ticket_summary_id) REFERENCES tickets_summary(id)
    );
  SQL
  puts "Table tickets_airline_companies created"

  # One row per flight of an airline row. The foreign key must name the table
  # created above (tickets_airline_companies); the previous definition pointed
  # at "tickets_airlines", which this migration never creates.
  db.execute <<-SQL
    CREATE TABLE IF NOT EXISTS airline_flights (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      ticket_airline_id INTEGER,
      flight_code VARCHAR(50),
      flight_price INTEGER,
      flight_changeable_status VARCHAR(50),
      flight_ticket_type VARCHAR(100),
      FOREIGN KEY(ticket_airline_id) REFERENCES tickets_airline_companies(id)
    );
  SQL

  puts "Table airline_flights created"

  puts "Migration ended."
ensure
  # Release the database handle even when one of the statements fails.
  db.close
end
211 changes: 154 additions & 57 deletions tour_site_scraper.rb
Original file line number Diff line number Diff line change
@@ -1,92 +1,189 @@
require 'rubygems'
require 'selenium-webdriver'
require 'pry'
require 'sqlite3'

# Shared database handle; the tables are created by migration.rb.
DB = SQLite3::Database.new("db_tour_scraper.db")

# MAX_RETRY and WAIT used to be assigned twice in this file, which makes Ruby
# warn about an already initialized constant and silently keeps the last value.
# Only the effective (last) definitions are kept here.

# Maximum number of retries while the search page loads. start_scraping sleeps
# one second between attempts, so this is roughly a limit in seconds.
MAX_RETRY = 40
MAX_CALL = 3 # Maximum number of times the site is re-opened for one date after an error (ajax error, busy page, ...)

# Put Ticket Search input dates here
TICKET_SEARCH_FROM_DATE = Date.new(2021, 12, 31)
TICKET_SEARCH_TO_DATE = Date.new(2022, 1, 31) # plain 1: a leading zero makes an octal literal (08/09 would not parse)
TIME_FROM_OUT = '0600'
TIME_TO_OUT = '0700'

WAIT = Selenium::WebDriver::Wait.new(timeout: 20) # Maximum wait (seconds) for the search result html to appear
WEB_DRIVER = Selenium::WebDriver.for :firefox

# options = Selenium::WebDriver::Firefox::Options.new(args: ['-headless'])
# driver = Selenium::WebDriver.for(:firefox, options: options)
driver = Selenium::WebDriver.for :firefox

puts 'Trying to fetch data from site.....'
puts '--------------------------------------------------------'

# Best-effort wait until the return ("in") ticket list is rendered.
#
# Waits, via the shared WAIT object, for both the company toggle button and the
# airline name inside #Act_response_in to be displayed. Failures are swallowed
# on purpose (the caller carries on either way), but only StandardError is
# rescued so that Ctrl-C (Interrupt) and SystemExit still stop the script.
#
# @param driver [#find_element] driver used to look the elements up
# @return [Boolean] true when both elements became visible, false otherwise
def check_return_tickets_visibility(driver)
  # Wait for few seconds until able to find return tickets list
  ['#Act_response_in .toggle-btn-company', '#Act_response_in .airline-name'].each do |selector|
    WAIT.until { driver.find_element(css: selector).displayed? }
  end
  true
rescue StandardError
  false
end

ticket_search_date_from = Date.new(2021, 12, 31)
ticket_search_date_to = Date.new(2022, 01, 31)
ticket_search_date_from.upto(ticket_search_date_to) do |dt|
departure_date_in = dt.to_s.delete("-")
departure_date_out = dt.to_s.delete("-")

puts "\n\nTickets for this date " + dt.to_s

# Opens the search result page for the given dates and clicks the outbound and
# return airline summary buttons.
#
# The lookup and click step is retried once per second, up to MAX_RETRY retries,
# because the buttons only exist after the page has finished loading.
# Selenium's find_element raises when nothing matches (it never returns nil),
# so a missing button is handled by the rescue below, not by a nil check.
#
# @param departure_date_in [String] value for the date_in query parameter (the caller passes YYYYMMDD)
# @param departure_date_out [String] value for the date_out query parameter (the caller passes YYYYMMDD)
# @return [nil]
# @raise [RuntimeError] when the buttons still cannot be clicked after MAX_RETRY retries
def start_scraping(departure_date_in, departure_date_out)
  # Generate the search url physically using any date, time and put here, we will make it dynamic later based on requirement
  search_url = "https://www.tour.ne.jp/j_air/list/?adult=1&arr_in=TYO&arr_out=CTS&change_date_in=0&change_date_out=0&" \
               "date_in=#{departure_date_in}&date_out=#{departure_date_out}&dpt_in=" \
               "CTS&dpt_out=TYO&time_from_out=#{TIME_FROM_OUT}&time_to_out=#{TIME_TO_OUT}&time_type_out=0"
  WEB_DRIVER.navigate.to search_url
  sleep(1) # Wait 1s to load the page properly

  retries = 0
  begin
    # Locate both buttons before clicking either one, so a retry never
    # re-clicks a button that was already toggled.
    summary_buttons = %w[#Act_Airline_Out #Act_Airline_In].map do |selector|
      WEB_DRIVER.find_element(:css, selector)
    end
    summary_buttons.each(&:click)
  rescue StandardError
    # StandardError only: Interrupt / SystemExit must not be retried.
    puts 'Trying to fetch data.. ' + retries.to_s
    retries += 1
    sleep(1) # Wait 1s to load the page properly
    retry if retries <= MAX_RETRY
    raise "Could not get ticket website information: Please give necessary information to search"
  end
  nil
end

check_return_tickets_visibility(driver)

# Scrap Available Tickets Elements
ticket_summary = driver.find_elements(:css, '#Act_response_out .airline-name')
ticket_available_lists = driver.find_elements(:css, '#Act_response_out .toggle-btn-company')
# Collects every airline box of one direction from the currently loaded result
# page, together with the flights listed inside each box.
#
# @param ticket_details_type [String] 'in' reads the return list
#   (#Act_response_in); any other value reads the outbound list (#Act_response_out)
# @return [Array(Array<Hash>, Integer)] a two-element array:
#   1. one Hash per airline with the keys :ticket_company_name,
#      :ticket_minimum_price (text as shown on the page), :number_of_ticket_found
#      and :ticket_flight_lists (Array of flight Hashes with String keys)
#   2. the sum of :number_of_ticket_found over all airlines
def searching_ticket_type(ticket_details_type)
  response_id = ticket_details_type == 'in' ? '#Act_response_in' : '#Act_response_out'
  ticket_airlines = WEB_DRIVER.find_elements(:css, "#{response_id} .company-list .company-box")

  all_tickets_details_lists = ticket_airlines.map do |ticket_airline|
    {
      ticket_company_name: ticket_airline.find_element(:css, '.airline-name').text,
      ticket_minimum_price: ticket_airline.find_element(:css, '.hdg-sup-price > b').text,
      # The button text contains the count; keep the digits only.
      number_of_ticket_found: ticket_airline.find_element(:css, '.toggle-btn-company').text.delete('^0-9').to_i,
      ticket_flight_lists: ticket_airline.find_elements(:css, '.Act_flight_list').map do |ticket_flight|
        scraped_flight_data(ticket_flight)
      end
    }
  end

  total_ticket_found = all_tickets_details_lists.sum { |airline| airline[:number_of_ticket_found] }
  [all_tickets_details_lists, total_ticket_found]
end

# Builds the flight Hash (String keys, as read by save_scrap_data) for one
# .Act_flight_list element.
# NOTE(review): innerHTML is read instead of `text`, presumably because the
# flight rows are not visible while collapsed - confirm.
def scraped_flight_data(ticket_flight)
  detail = '.ticket-detail-item .ticket-detail-item-inner'
  {
    'flight_code' => inner_html_of_match(ticket_flight, '.ticket-summary-row > span', 1),
    'flight_price' => inner_html_of_match(ticket_flight, "#{detail} .ticket-price > label > b", 0),
    'flight_seat' => inner_html_of_match(ticket_flight, "#{detail} .ticket-detail-type .ticket-detail-icon .icon-seat", 0),
    'flight_changeable_status' => inner_html_of_match(ticket_flight, "#{detail} .ticket-detail-type .ticket-detail-icon .icon-date", 0),
    'flight_type' => inner_html_of_match(
      ticket_flight,
      "#{detail} .ticket-detail-type .ticket-detail-type-text .ticket-detail-type-text-ellipsis",
      0
    )
  }
end

# Returns the innerHTML of the index-th element matching selector below element,
# or nil when there is no such match, so one missing field does not abort the
# scrape of the whole page.
def inner_html_of_match(element, selector, index)
  element.find_elements(:css, selector)[index]&.attribute('innerHTML')
end

# Scrap Returning Tickets Elements
ticket_summary_in = driver.find_elements(:css, '#Act_response_in .airline-name')
ticket_available_lists_in = driver.find_elements(:css, '#Act_response_in .toggle-btn-company')
# Save tickets scraped data to database SQLite into different tables
#
# Writes one tickets_summary row for the search, then one
# tickets_airline_companies row per airline (ticket_type 'out' or 'in') and one
# airline_flights row per flight. Everything runs in a single transaction so a
# failure half-way does not leave a summary row with only part of its children.
# Columns are named explicitly (they match migration.rb), which lets SQLite
# assign the AUTOINCREMENT ids and keeps the inserts independent of column order.
#
# @param tickets_out_lists [Array(Array<Hash>, Integer)] outbound result of searching_ticket_type
# @param tickets_in_lists [Array(Array<Hash>, Integer)] return result of searching_ticket_type
# @param departure_date [#to_s] stored as departure_date
# @param return_date [#to_s] stored as return_date
# @return [Integer] id of the tickets_summary row that was inserted
def save_scrap_data(tickets_out_lists, tickets_in_lists, departure_date, return_date)
  all_ticket_out_lists, total_ticket_out_found = tickets_out_lists
  all_ticket_in_lists, total_ticket_in_found = tickets_in_lists
  puts "Total tickets found for out is = " + total_ticket_out_found.to_s
  puts "Total tickets found for in is = " + total_ticket_in_found.to_s

  ticket_summary_id = nil
  DB.transaction do
    # Save ticket summary
    DB.execute(
      'INSERT INTO tickets_summary (departure_date, return_date, time_from_out, time_to_out, ' \
      'search_time, total_tickets_out, total_tickets_in) VALUES (?, ?, ?, ?, ?, ?, ?)',
      [
        departure_date.to_s,
        return_date.to_s,
        TIME_FROM_OUT,
        TIME_TO_OUT,
        Time.now.strftime("%Y-%m-%d %H:%M:%S"),
        total_ticket_out_found,
        total_ticket_in_found
      ]
    )
    ticket_summary_id = DB.last_insert_row_id

    save_airline_tickets(ticket_summary_id, all_ticket_out_lists, 'out')
    save_airline_tickets(ticket_summary_id, all_ticket_in_lists, 'in')
  end
  ticket_summary_id
end

# Inserts the airline rows of one direction ('out' or 'in') and, under each
# airline row, its flight rows.
# NOTE(review): prices are stored exactly as scraped (page text); the columns
# are declared FLOAT / INTEGER in migration.rb - confirm whether they should be
# converted to numbers first. The scraped 'flight_seat' value has no column and
# is not stored.
def save_airline_tickets(ticket_summary_id, airline_tickets, ticket_type)
  airline_tickets.each do |airline|
    # Save company tickets informations
    DB.execute(
      'INSERT INTO tickets_airline_companies (ticket_summary_id, airline_company_name, ' \
      'ticket_lowest_price, total_flights_available, ticket_type) VALUES (?, ?, ?, ?, ?)',
      [
        ticket_summary_id,
        airline[:ticket_company_name],
        airline[:ticket_minimum_price],
        airline[:number_of_ticket_found],
        ticket_type
      ]
    )
    airline_row_id = DB.last_insert_row_id

    # Save ticket flights information
    airline[:ticket_flight_lists].each do |flight|
      DB.execute(
        'INSERT INTO airline_flights (ticket_airline_id, flight_code, flight_price, ' \
        'flight_changeable_status, flight_ticket_type) VALUES (?, ?, ?, ?, ?)',
        [
          airline_row_id,
          flight['flight_code'],
          flight['flight_price'],
          flight['flight_changeable_status'],
          flight['flight_type']
        ]
      )
    end
  end
end

# Write all tickets search results
puts 'Total available ticket OUT found is = ' + total_available_ticket.to_s
puts 'Total available ticket IN found is = ' + total_available_ticket_in.to_s
# Scrape every date from TICKET_SEARCH_FROM_DATE to TICKET_SEARCH_TO_DATE
# (inclusive) and store each day's result.
#
# For one date the whole open-page / wait / scrape sequence is repeated up to
# MAX_CALL more times when anything in it fails (start_scraping additionally
# retries its own button lookup). The ensure block closes the browser and the
# database whether the run finished or was aborted by an error.
begin
  TICKET_SEARCH_FROM_DATE.upto(TICKET_SEARCH_TO_DATE) do |dt|
    # The search url takes dates as YYYYMMDD.
    departure_date_in = dt.strftime('%Y%m%d')
    departure_date_out = dt.strftime('%Y%m%d')

    puts "\n\nTickets for this date " + dt.to_s

    retries = 0
    begin
      start_scraping(departure_date_in, departure_date_out)
      # Wait for few seconds until able to find return tickets list
      WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_out .company-list").displayed? }
      WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_in .company-list").displayed? }
      tickets_out_lists = searching_ticket_type('out')
      tickets_in_lists = searching_ticket_type('in')
    rescue StandardError
      # StandardError only: Ctrl-C (Interrupt) and SystemExit must end the run
      # instead of triggering another attempt.
      retries += 1
      retry if retries <= MAX_CALL
      raise "Could not get ticket website information: Please give necessary information to search"
    end

    # Save scraped ticket details, initially departure date and return date is same
    save_scrap_data(tickets_out_lists, tickets_in_lists, dt, dt)
  end
ensure
  # Without this the Firefox window and the SQLite handle stay open after the run.
  WEB_DRIVER.quit
  DB.close
end