-
Notifications
You must be signed in to change notification settings - Fork 0
/
quotes_spider_arrow.py
83 lines (72 loc) · 4.98 KB
/
quotes_spider_arrow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import csv
import os
import re
import time
from urllib.parse import quote_plus

import ipdb
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from webdriver_manager.chrome import ChromeDriverManager
## Pending work: Arrow has an API and does not allow scraping, so we need to use the API to get the data; after that we need to use Selenium to get the data from the product page and from the search results page.
class QuotesSpider(scrapy.Spider):
    """Look up a single part number on arrow.com.

    Request flow:
      1. ``start_requests`` fetches the Arrow home page.
      2. ``parse_search_result`` loads that page in the Selenium driver and
         requests the search URL for ``self.part_number``.
      3. ``parse_product_page`` receives the search response. Extraction is
         not implemented yet; see the draft kept in that method.

    The spider owns a headless Firefox driver for its whole lifetime; it is
    shut down in ``closed``.
    """

    name = "arrow"
    start_urls = [
    ]

    def __init__(self, partnumber, *args, **kwargs):
        """Store the part number and start the headless Firefox driver.

        Args:
            partnumber: The part number to search for (spider argument).
            *args: Forwarded to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded to ``scrapy.Spider.__init__``.
        """
        # Let the base class process the remaining spider arguments.
        super().__init__(*args, **kwargs)
        self.start_urls = ['https://www.arrow.com']
        self.part_number = partnumber

        firefox_options = Options()
        firefox_options.add_argument("--headless")
        # NOTE(review): the four flags below look like Chrome switches; kept
        # unchanged, but confirm Firefox actually honours them.
        firefox_options.add_argument("--window-size=1920,1080")
        firefox_options.add_argument('--start-maximized')
        firefox_options.add_argument('--disable-gpu')
        firefox_options.add_argument('--no-sandbox')
        # NOTE(review): ``executable_path`` is deprecated in Selenium 4 in
        # favour of a ``Service`` object - confirm the pinned Selenium version
        # before changing it. The path is relative to the working directory.
        self.driver = webdriver.Firefox(options=firefox_options, executable_path=r'../../geckodriver')

    def closed(self, reason):
        """Quit the Selenium driver when the spider is closed.

        Args:
            reason: Close reason supplied by Scrapy (unused).
        """
        # ``driver`` may be missing if __init__ failed before creating it.
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()

    def start_requests(self):
        """Yield the initial request(s) for every URL in ``start_urls``.

        Yields:
            scrapy.Request: Request handled by ``parse_search_result``.
        """
        # These are HTTP request headers, so they go in ``headers=`` rather
        # than ``cookies=``. The HTTP/2 pseudo-headers (':authority',
        # ':method', ':path') are generated by the transport and are dropped.
        # 'accept-encoding' is also left out so Scrapy advertises only the
        # encodings it is able to decode.
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-language': 'en-US,en;q=0.9',
            'cache-control': 'max-age=0',
        }
        for url in self.start_urls:
            yield scrapy.Request(url=url, headers=headers, callback=self.parse_search_result)

    def parse_search_result(self, response):
        """Open the landing page in Selenium and request the search page.

        Args:
            response: Response for one of the ``start_urls``.

        Yields:
            scrapy.Request: Search request for ``self.part_number``, handled
            by ``parse_product_page``.
        """
        self.driver.get(response.url)
        # URL-encode the part number so characters such as '#', '+', '/' or
        # spaces do not corrupt the query string.
        encoded_part = quote_plus(str(self.part_number))
        search_url = 'https://www.arrow.com/en/products/search?cat=&q=%s&r=true' % encoded_part
        yield scrapy.Request(url=search_url, callback=self.parse_product_page)

    # Open question from the original author: are arrow, allie and mouser the same?
    def parse_product_page(self, response):
        """Handle the search / product response (extraction not implemented).

        Intended behaviour, per the original author's notes: if the search
        redirected away from the search URL, the part is an exact match and
        the data is read from the product page; otherwise the exact match is
        selected from the search results and its page is requested; if there
        is no exact match, a dictionary of default values is produced.

        Args:
            response: Response for the search URL (or the page it redirected
                to).

        Returns:
            None. No items are produced yet.
        """
        self.logger.warning(
            "parse_product_page is not implemented; ignoring %s", response.url
        )
        # TODO: finish the extraction. The original draft is kept below for
        # reference. Points to fix when enabling it:
        #   * ``find_element`` raises NoSuchElementException instead of
        #     returning a falsy value, so ``if links:`` never reaches its
        #     ``else`` branch - use ``find_elements`` or catch the exception.
        #   * ``sku`` is assigned the element itself, not its ``.text``.
        #   * Mixing ``return data`` with ``yield`` turns the method into a
        #     generator whose returned dict is discarded - ``yield data``.
        #   * The exact-match URL comparison must use the same URL-encoded
        #     search URL that ``parse_search_result`` builds.
        #
        # self.driver.get(response.url)
        # data = {}
        # exactMatch = False
        # if response.url == 'https://www.arrow.com/en/products/search?cat=&q=%s&r=true' % self.part_number:
        #     exactMatch = False
        # else:
        #     exactMatch = True
        # if exactMatch:
        #     Inventory = self.driver.find_element(By.XPATH, "//*[contains(@class, 'BuyingOptions-title BuyingOptions-title--underlined ng-star-inserted')]").text
        #     sku = self.driver.find_element(By.XPATH, "//div[@name='product-details']//section//section//div//div//div//div//div//div//div//div//span[contains(text(),'%s')]" % self.part_number)
        #     price = self.driver.find_element(By.XPATH, "//*[contains(@class, 'BuyingOptions-priceTiers-price ng-star-inserted')]").text
        #     quantity = self.driver.find_element(By.XPATH, "//*[contains(@class, 'BuyingOptions-priceTiers-quantity ng-star-inserted')]").text
        #     data = {'Distributor': 'arrow', 'SKU': sku, 'Inventory': Inventory, 'quantity': quantity, 'price': price, 'ClickUrl': response.url, 'currency': 'USD'}
        #     return data
        # else:
        #     links = self.driver.find_element(By.XPATH, "//*[contains(@data-part-name, '%s')]" % self.part_number)
        #     if links:
        #         yield scrapy.Request(url=links.get_attribute('href'), callback=self.parse_product_page)
        #     else:
        #         data = {'Distributor': 'arrow', 'SKU': 'no match', 'Inventory': '0', 'quantity': '0', 'price': '0', 'ClickUrl': 'no match', 'currency': 'USD'}
        #         return data