diff --git a/Gemfile b/Gemfile index 9004097..59faf60 100644 --- a/Gemfile +++ b/Gemfile @@ -4,3 +4,4 @@ gem 'watir', '~> 6.19', '>= 6.19.1' gem 'webdrivers', '~> 4.6' gem 'nokogiri', '~> 1.11', '>= 1.11.7' gem 'geckodriver-helper', '~> 0.0.3' +gem 'sqlite3' diff --git a/Gemfile.lock b/Gemfile.lock index 8a88c71..15cda8e 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -7,6 +7,10 @@ GEM geckodriver-helper (0.0.5) archive-zip (~> 0.7) io-like (0.3.1) + mini_portile2 (2.6.1) + nokogiri (1.12.5) + mini_portile2 (~> 2.6.1) + racc (~> 1.4) nokogiri (1.12.5-x86_64-linux) racc (~> 1.4) racc (1.6.0) @@ -17,6 +21,7 @@ GEM childprocess (>= 0.5, < 5.0) rexml (~> 3.2, >= 3.2.5) rubyzip (>= 1.2.2) + sqlite3 (1.4.2) watir (6.19.1) regexp_parser (>= 1.2, < 3) selenium-webdriver (>= 3.142.7) @@ -32,6 +37,7 @@ PLATFORMS DEPENDENCIES geckodriver-helper (~> 0.0.3) nokogiri (~> 1.11, >= 1.11.7) + sqlite3 watir (~> 6.19, >= 6.19.1) webdrivers (~> 4.6) diff --git a/README.md b/README.md index 9f6e735..b86eeaa 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ # rubysite scrapping process Run `bundle install` +Run `ruby migration.rb` Run `ruby tour_site_scraper.rb` + diff --git a/db_tour_scraper.db b/db_tour_scraper.db new file mode 100644 index 0000000..e8ab414 Binary files /dev/null and b/db_tour_scraper.db differ diff --git a/migration.rb b/migration.rb new file mode 100644 index 0000000..03abf6c --- /dev/null +++ b/migration.rb @@ -0,0 +1,48 @@ +require 'sqlite3' + +db = SQLite3::Database.open "db_tour_scraper.db" + +puts "Migration started ...." + +db.execute < e puts 'Trying to fetch data.. ' + retries.to_s retries += 1 @@ -50,43 +44,146 @@ def check_return_tickets_visibility(driver) retry if (retries <= MAX_RETRY) raise "Could not get ticket website information: Please give necessary information to search" end +end - check_return_tickets_visibility(driver) - - # Scrap Available Tickets Elements - ticket_summary = driver.find_elements(:css, '#Act_response_out .airline-name') - ticket_available_lists = driver.find_elements(:css, '#Act_response_out .toggle-btn-company') +def searching_ticket_type(ticket_details_type) + if ticket_details_type == 'in' + ticket_airlines = WEB_DRIVER.find_elements(:css, '#Act_response_in .company-list .company-box') + else + ticket_airlines = WEB_DRIVER.find_elements(:css, '#Act_response_out .company-list .company-box') + end - # Parse elements to find each companies available ticket and sum - total_available_ticket = 0 - ticket_available_lists&.each do |ticket_count| - total_available_ticket += ticket_count.text.delete('^0-9').to_i + total_ticket_found = 0 + all_tickets_details_lists = [] + ticket_airlines&.each do |ticket_airline| + temp_ticket_airline_info = {} + number_of_ticket_found = 0 + + ticket_company_name = ticket_airline.find_element(:css, '.airline-name').text + number_of_ticket_found = ticket_airline.find_element(:css, '.toggle-btn-company').text.delete('^0-9').to_i + total_ticket_found += number_of_ticket_found + ticket_minimum_price = ticket_airline.find_element(:css, '.hdg-sup-price > b').text + + temp_ticket_airline_info[:ticket_company_name] = ticket_company_name + temp_ticket_airline_info[:ticket_minimum_price] = ticket_minimum_price + temp_ticket_airline_info[:number_of_ticket_found] = number_of_ticket_found + + ticket_flight_lists = [] + ticket_airline_flights_lists = ticket_airline.find_elements(:css, '.Act_flight_list') + ticket_airline_flights_lists&.each do |ticket_flight| + flight_data = {} + flight_data['flight_code'] = ticket_flight.find_elements(:css, '.ticket-summary-row > span')[1].attribute("innerHTML") + flight_data['flight_price'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-price > label > b')[0].attribute("innerHTML") + flight_data['flight_seat'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-seat')[0].attribute("innerHTML") + flight_data['flight_changeable_status'] = ticket_flight.find_elements(:css, '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-icon .icon-date')[0].attribute("innerHTML") + flight_data['flight_type'] = ticket_flight.find_elements(:css, + '.ticket-detail-item .ticket-detail-item-inner .ticket-detail-type .ticket-detail-type-text .ticket-detail-type-text-ellipsis')[0].attribute("innerHTML") + ticket_flight_lists.push(flight_data) + end + temp_ticket_airline_info[:ticket_flight_lists] = ticket_flight_lists + all_tickets_details_lists.push(temp_ticket_airline_info) end + return all_tickets_details_lists, total_ticket_found +end - # Scrap Returning Tickets Elements - ticket_summary_in = driver.find_elements(:css, '#Act_response_in .airline-name') - ticket_available_lists_in = driver.find_elements(:css, '#Act_response_in .toggle-btn-company') +# Save tickets scraped data to database SQLite into different tables +def save_scrap_data(tickets_out_lists, tickets_in_lists, departure_date, return_date) + all_ticket_out_lists = tickets_out_lists[0] + all_ticket_in_lists = tickets_in_lists[0] + total_ticket_out_found = tickets_out_lists[1] + total_ticket_in_found = tickets_in_lists[1] + puts "Total tickets found for out is = " + total_ticket_out_found.to_s + puts "Total tickets found for in is = " + total_ticket_in_found.to_s + + # Save ticket summary + ticket_summary_data = [ + nil, + departure_date.to_s, + return_date.to_s, + TIME_FROM_OUT, + TIME_TO_OUT, + Time.now.strftime("%Y-%m-%d %H:%M:%S"), + total_ticket_out_found, total_ticket_in_found + ] + DB.execute("INSERT INTO tickets_summary values(?, ?, ?, ?, ?, ?, ?, ? )", ticket_summary_data) + ticket_summary_id = DB.last_insert_row_id() + + # Save all available out/departure tickets comapny and comapnies flights data + all_ticket_out_lists.each do |tickets_out| + # Save company tickets informations + company_data = [ + nil, + ticket_summary_id, + tickets_out[:ticket_company_name], + tickets_out[:ticket_minimum_price], + tickets_out[:number_of_ticket_found], + 'out' + ] + DB.execute("INSERT INTO tickets_airline_companies values(?, ?, ?, ?, ?, ?)", company_data) + + # Save ticket flights information + ticket_out_company_id = DB.last_insert_row_id() + tickets_out[:ticket_flight_lists].each do |flight| + flight_data = [ + nil, + ticket_out_company_id, + flight['flight_code'], + flight['flight_price'], + flight['flight_changeable_status'], + flight['flight_type'] + ] + DB.execute("INSERT INTO airline_flights values(?, ?, ?, ?, ?, ?)", flight_data) + end + end - # Parse elements to find each companies returning tickets and sum - total_available_ticket_in = 0 - ticket_available_lists_in&.each do |ticket_count_in| - total_available_ticket_in += ticket_count_in.text.delete('^0-9').to_i + # Save all available in/return tickets comapny and comapnies flights data + all_ticket_in_lists.each do |tickets_in| + # Save company tickets informations + ticket_in_company_data = [ + nil, + ticket_summary_id, + tickets_in[:ticket_company_name], + tickets_in[:ticket_minimum_price], + tickets_in[:number_of_ticket_found], + 'in' + ] + DB.execute("INSERT INTO tickets_airline_companies values(?, ?, ?, ?, ?, ?)", ticket_in_company_data) + ticket_in_company_id = DB.last_insert_row_id() + tickets_in[:ticket_flight_lists].each do |flight| + flight_data = [ + nil, + ticket_in_company_id, + flight['flight_code'], + flight['flight_price'], + flight['flight_changeable_status'], + flight['flight_type'] + ] + DB.execute("INSERT INTO airline_flights values(?, ?, ?, ?, ?, ?)", flight_data) + end end + rows = DB.execute( "select * from tickets_summary" ) +end - # Write all tickets search results - puts 'Total available ticket OUT found is = ' + total_available_ticket.to_s - puts 'Total available ticket IN found is = ' + total_available_ticket_in.to_s +TICKET_SEARCH_FROM_DATE.upto(TICKET_SEARCH_TO_DATE) do |dt| + departure_date_in = dt.to_s.delete("-") + departure_date_out = dt.to_s.delete("-") - puts 'Available ticket IN companies name : ' - puts '------------------------------------' - ticket_summary_in&.each do |ticket_cmpany_in| - puts ticket_cmpany_in.text.to_s + ', ' - end + puts "\n\nTickets for this date " + dt.to_s - puts - puts 'Available ticket OUT companies name : ' - puts '-------------------------------------' - ticket_summary&.each do |ticket_cmpany| - puts ticket_cmpany.text.to_s + ', ' + begin + retries ||= 0 + start_scraping(departure_date_in, departure_date_out) + # Wait for few seconds until able to find return tickets list + WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_out .company-list").displayed? } + WAIT.until { WEB_DRIVER.find_element(css: "#Act_response_in .company-list").displayed? } + tickets_out_lists = searching_ticket_type('out') + tickets_in_lists = searching_ticket_type('in') + rescue Exception + retries += 1 + retry if (retries <= MAX_CALL) + raise "Could not get ticket website information: Please give necessary information to search" end -end + + # Save scraped ticket details, initially departure date and return date is same + save_scrap_data(tickets_out_lists, tickets_in_lists, dt, dt) +end \ No newline at end of file