
Commit

finished adding supplier scrape logic
ParzivalRealm committed Feb 21, 2023
1 parent 68b36e5 commit 7227682
Showing 15 changed files with 243 additions and 55 deletions.
2 changes: 1 addition & 1 deletion config/database.yml
@@ -9,7 +9,7 @@ default: &default
development:
<<: *default
database: crawler_rebuild_development
-username: deploy
+username: postgres
password: Kingdom1


10 changes: 6 additions & 4 deletions config/puma.rb
@@ -2,16 +2,18 @@

# start puma with:
# RAILS_ENV=production bundle exec puma -C ./config/puma.rb
# or
# RAILS_ENV=development bundle exec puma -C ./config/puma.rb

application_path = Rails.root
-railsenv = 'production'
+railsenv = ENV['RAILS_ENV'] || 'development'
directory application_path
environment railsenv
-daemonize true
+#daemonize true
pidfile "#{application_path}/tmp/pids/puma-#{railsenv}.pid"
state_path "#{application_path}/tmp/pids/puma-#{railsenv}.state"
stdout_redirect "#{application_path}/log/puma-#{railsenv}.stdout.log",
                "#{application_path}/log/puma-#{railsenv}.stderr.log"
threads 0, 16
bind "unix://#{application_path}/tmp/sockets/#{railsenv}.socket"
bind "unix://#{application_path}/tmp/sockets/#{railsenv}.socket"
28 changes: 27 additions & 1 deletion db/schema.rb

Some generated files are not rendered by default.

258 changes: 209 additions & 49 deletions lib/scrapper_service.rb
@@ -18,17 +18,18 @@ def call
parsed_data = []

@xlsx.each do |row|
-  # self.scrape_info("plcity", "https://www.plc-city.com/shop/en/content/search?q=#{row[0]}", 5, row[0]) # this is just for testing, delete when done
-  suppliers_list.each do |supplier| # iterate over each supplier
+  self.scrape_info("williamsautomations", "https://williamsautomation.com/search?type=product&options%5Bprefix%5D=last&q=#{row[0]}", 4, row[0]) # this is just for testing, delete when done
+  # suppliers_list.each do |supplier| # iterate over each supplier

-    base_url = supplier.website
-    searchpath = supplier.searchpath.gsub('ssacprtno', row[0])
-    url = base_url + searchpath
-    if supplier.name == "digikey"
-      url = base_url
-    end
-    self.scrape_info(supplier.name, url, supplier.id, row[0])
-  end
+  #   base_url = supplier.website
+  #   searchpath = supplier.searchpath.gsub('ssacprtno', row[0])
+  #   url = base_url + searchpath
+  #   if supplier.name == "digikey"
+  #     url = base_url
+  #   end
+  #   self.scrape_info(supplier.name, url, supplier.id, row[0])
+  # end
  end
end
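The commented-out loop builds each supplier's search URL by substituting the part number for the ssacprtno placeholder stored on the supplier record; a minimal illustration with assumed sample values (the real template and part number come from the suppliers table and the spreadsheet):

# Hypothetical sample values for illustration only.
searchpath_template = "/search?q=ssacprtno"
part_number = "1756-L61"
url = "https://supplier.example.com" + searchpath_template.gsub("ssacprtno", part_number)
# => "https://supplier.example.com/search?q=1756-L61"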

@@ -80,9 +81,7 @@ def scrape_info(supplier_name, url, supplier_id, part_number)
links = [item.attribute('href')] rescue [item]
xpaths = self.xpaths(supplier_name)

-links.each do |link|
-  driver.get(link)
-  sleep 1
+links.each do |link|
  data_value = driver.find_element(:css, ".product_price").text rescue ["0"]
  data_value = ["0"] if data_value == [] # empty result means the product was not found or the part number is inexact; record "0" and move on to the next supplier
  scrape_info["price"] = driver.find_element(:xpath, xpaths["price"]).text rescue "0"
@@ -171,16 +170,177 @@ def scrape_info(supplier_name, url, supplier_id, part_number)
# end

when "mouser"
page_info = fetch_page(url)
doc = Nokogiri::HTML(page_info)
body = doc.xpath("//body")

if body.text.include?("Object moved to ")
product_path = doc.css("a").first["href"]
url = "https://www.mouser.com" + product_path
page_info = fetch_page(url)
doc = Nokogiri::HTML(page_info)
info = doc.xpath("//script")
json_unparsed = info.children[6].text
json_parsed = JSON.parse(json_unparsed)
scrape_info["price"] = json_parsed["offers"]["price"]
scrape_info["order_amount"] = 1
scrape_info["inventory"] = json_parsed["offers"]["InventoryLevel"]
scraped_data_instance = ScrapedDatum.new(scrapper_id: @scrapper.id, supplier_id: supplier_id, part_number: part_number, order_amount: scrape_info["order_amount"], inventory: scrape_info["inventory"], price: scrape_info["price"])
scraped_data_instance.save
else
scrape_info["price"] = 0
scrape_info["order_amount"] = 1
scrape_info["inventory"] = 0
scraped_data_instance = ScrapedDatum.new(scrapper_id: @scrapper.id, supplier_id: supplier_id, part_number: part_number, order_amount: scrape_info["order_amount"], inventory: scrape_info["inventory"], price: scrape_info["price"])
scraped_data_instance.save
end
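The info.children[6] index depends on the order of the page's script tags; a minimal, hypothetical hardening sketch that locates the JSON-LD block by its type attribute instead (this assumes the product page carries a script tag of type application/ld+json with an "offers" object, as the branch above implies; it is not part of this commit):

require 'nokogiri'
require 'json'

# Hypothetical helper: pick out the JSON-LD product data by tag type so
# reordered <script> tags don't break the scrape.
def mouser_offers(html)
  doc = Nokogiri::HTML(html)
  doc.css('script[type="application/ld+json"]').each do |script|
    data = JSON.parse(script.text) rescue next
    return data["offers"] if data.is_a?(Hash) && data["offers"]
  end
  nil
end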
when "mrosupply"

not_found_value = driver.find_element(:xpath, "//*[contains(@class, 'm-primary-box--title')]").text rescue nil
if not_found_value == "Looks like no matches were found. But with a selection of over 1.25 million parts (and growing), we probably have the item you are looking for."
scrape_info["price"] = 0
scrape_info["order_amount"] = 1
scrape_info["inventory"] = 0
scraped_data_instance = ScrapedDatum.new(scrapper_id: @scrapper.id, supplier_id: supplier_id, part_number: part_number, order_amount: scrape_info["order_amount"], inventory: scrape_info["inventory"], price: scrape_info["price"])
scraped_data_instance.save
else
product_page_result = driver.find_element(:xpath, "//*[contains(@class, 'm-catalogue-product-img')]/a")
product_page_link = product_page_result.attribute("href")
driver.navigate.to product_page_link
sleep(1)
price_result = driver.find_element(:xpath, "//*[contains(@class, 'price')]")
scrape_info["price"] = price_result.text.gsub(/[^0-9.]/, '') rescue ["0"]
out_of_stock_indicator = driver.find_element(:xpath, "//*[contains(@class, 'u-warning')]").text rescue nil
if out_of_stock_indicator == "CONFIRM AVAILABILITY"
scrape_info["inventory"] = 0
scrape_info["order_amount"] = 1
scraped_data_instance = ScrapedDatum.new(scrapper_id: @scrapper.id, supplier_id: supplier_id, part_number: part_number, order_amount: scrape_info["order_amount"], inventory: scrape_info["inventory"], price: scrape_info["price"])
else
scrape_info["inventory"] = 1 #it appears that every item has the confirm availability button, so we need to check the inventory status when we find a product that displays differently.
scrape_info["order_amount"] = 1
scraped_data_instance = ScrapedDatum.new(scrapper_id: @scrapper.id, supplier_id: supplier_id, part_number: part_number, order_amount: scrape_info["order_amount"], inventory: scrape_info["inventory"], price: scrape_info["price"])
end
end

when "onlinecomponents"

when "sager"
scrape_info["inventory"] = driver.find_element(:xpath, "//span[contains(@class,'value Instock-availability')]").text rescue 0
price_table = driver.find_element(:xpath, "//div[@id='divPriceListLeft']")
order_quantities = price_table.find_elements(:xpath, "*/div[contains(@class, 'col-4 pr-5 text-graphite-dark pl-0')]")
prices = price_table.find_elements(:xpath, "*/*[@class='col-4 text-right']")
prices.shift
order_quantities.each_with_index do |element, index|
order_amount = element.text.gsub(/[^0-9.]/, '')
price = prices[index].text.gsub(/[^0-9.]/, '')
begin
order_amount_int = Integer(order_amount)
price_float = Float(price)
scrape_info["order_amount"] = order_amount_int
scrape_info["price"] = price_float
begin
scraped_data_instance = ScrapedDatum.new(
scrapper_id: @scrapper.id,
supplier_id: supplier_id,
part_number: part_number,
order_amount: scrape_info["order_amount"],
inventory: scrape_info["inventory"],
price: scrape_info["price"]
)
scraped_data_instance.save
rescue StandardError
puts "Could not save record"
next
end
rescue ArgumentError
puts "Could not convert to integer or float skipping record"
next
end
end

when "tti"

begin
scrape_info["inventory"] = driver.find_element(:xpath, "//div[@class='c-part-detail__availability-column']/div/div[contains(@class, 'u-font')]/span") rescue 0
price_table = driver.find_element(:xpath, "//div[@id='productDetailQuantities']")
rows = price_table.find_elements(:xpath, "*/div[contains(@class, 'c-product-detail__quantity-price')]")
rows.each do |row|

data_container = row.find_element(:xpath, "*/div[@class='row c-part-detail__pricing-container']")
begin
scrape_info["order_amount"] = data_container.find_element(:xpath, "*/div[@class='col-xs-4 c-part-detail__pricing-quantity']").text.gsub(/[^0-9.]/, '')
rescue StandardError
scrape_info["order_amount"] = 0
puts "order_amount not found... setting to 0"
next
end

begin
scrape_info["price"] = data_container.find_element(:xpath, "*/div[@class='col-xs-4 c-part-detail__pricing-extended']").text.gsub(/[^0-9.]/, '')
rescue StandardError
scrape_info["price"] = 0
puts "price not found... setting to 0"
next
end

begin
scraped_data_instance = ScrapedDatum.new(
scrapper_id: @scrapper.id,
supplier_id: supplier_id,
part_number: part_number,
order_amount: scrape_info["order_amount"],
inventory: scrape_info["inventory"],
price: scrape_info["price"]
)
scraped_data_instance.save
rescue StandardError
puts "Could not save record"
next
end
end

rescue StandardError
puts "Could not find inventory or price table"
end
when "williamsautomations"
not_found_indicator = driver.find_element(:xpath, "//*[@class='tc']").text rescue false
not_found_indicator == false ? next : not_found_indicator = not_found_indicator
if not_found_indicator.include?("did not yield any results")
puts "No results found for #{part_number}"
next
else
product_found_img = driver.find_element(:xpath, "//*[@class = 'grid_img_wr']/a")
product_url = product_found_img.attribute("href")
driver.navigate.to product_url
sleep(1)
begin
scrape_info["price"] = driver.find_element(:xpath, "//*[contains(@class, 'money')]").text.gsub(/[^0-9.]/, '')
rescue StandardError
scrape_info["price"] = 0
puts "price not found... setting to 0"
next
end
begin
scrape_info["inventory"] = driver.find_element(:xpath, "//*[contains(@id, 'available-qty')]").text.gsub(/[^0-9.]/, '')
rescue StandardError
scrape_info["inventory"] = 0
puts "inventory not found... setting to 0"
next
end
scrape_info["order_amount"] = 1
begin
scraped_data_instance = ScrapedDatum.new(
scrapper_id: @scrapper.id,
supplier_id: supplier_id,
part_number: part_number,
order_amount: scrape_info["order_amount"],
inventory: scrape_info["inventory"],
price: scrape_info["price"]
)
scraped_data_instance.save
rescue StandardError
puts "Could not save record"
next
end
end

binding.pry # debugging breakpoint left in; remove before unattended runs
else
#remove this when all suppliers are added
end
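Every branch above finishes by building and saving a ScrapedDatum from the same six attributes; a small helper along these lines (a sketch, not part of this commit) would remove that repetition:

# Hypothetical extraction of the repeated persistence step; assumes the same
# @scrapper, supplier_id, and part_number that scrape_info already has in scope.
def save_scraped_datum(supplier_id, part_number, scrape_info)
  ScrapedDatum.new(
    scrapper_id: @scrapper.id,
    supplier_id: supplier_id,
    part_number: part_number,
    order_amount: scrape_info["order_amount"],
    inventory: scrape_info["inventory"],
    price: scrape_info["price"]
  ).save
rescue StandardError
  puts "Could not save record"
  false
end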
@@ -261,42 +421,42 @@ def xpaths(supplier_name)
# end


-# def fetch_page(url)
-#   uri = URI(url)
+def fetch_page(url)
+  uri = URI(url)

-#   headers = {
-#     'accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
-#     'accept-encoding' => 'gzip, deflate, br',
-#     'accept-language' => 'en,en-US;q=0.9,es;q=0.8',
-#     'cache-control' => 'no-cache',
-#     'pragma' => 'no-cache',
-#     'sec-ch-ua' => '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
-#     'sec-ch-ua-mobile' => '?0',
-#     'sec-ch-ua-platform' => '"Linux"',
-#     'sec-fetch-dest' => 'document',
-#     'sec-fetch-mode' => 'navigate',
-#     'sec-fetch-site' => 'same-origin',
-#     'sec-fetch-user' => '?1',
-#     'upgrade-insecure-requests' => '1',
-#     'user-agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
-#   }
+  headers = {
+    'accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+    'accept-encoding' => 'gzip, deflate, br',
+    'accept-language' => 'en,en-US;q=0.9,es;q=0.8',
+    'cache-control' => 'no-cache',
+    'pragma' => 'no-cache',
+    'sec-ch-ua' => '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
+    'sec-ch-ua-mobile' => '?0',
+    'sec-ch-ua-platform' => '"Linux"',
+    'sec-fetch-dest' => 'document',
+    'sec-fetch-mode' => 'navigate',
+    'sec-fetch-site' => 'same-origin',
+    'sec-fetch-user' => '?1',
+    'upgrade-insecure-requests' => '1',
+    'user-agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
+  }

-#   response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
-#     request = Net::HTTP::Get.new(url)
-#     headers.each do |key, value|
-#       request[key] = value
-#     end
+  response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
+    request = Net::HTTP::Get.new(url)
+    headers.each do |key, value|
+      request[key] = value
+    end

-#     http.request(request)
-#   end
-#   if response['content-encoding'] == 'gzip'
-#     body = Zlib::GzipReader.new(StringIO.new(response.body)).read
-#   else
-#     body = response.body
-#   end
+    http.request(request)
+  end
+  if response['content-encoding'] == 'gzip'
+    body = Zlib::GzipReader.new(StringIO.new(response.body)).read
+  else
+    body = response.body
+  end

-#   body
-# end
+  body
+end
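A minimal usage sketch of the now-active fetch_page (the search URL shape is an assumed example, not Mouser's documented format):

# fetch_page returns the response body, gunzipped when the server compressed it.
html = fetch_page("https://www.mouser.com/Search/Refine?Keyword=1756-L61") # hypothetical URL
doc = Nokogiri::HTML(html)
puts doc.title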


def parse_and_save(scraped_data)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
