Adapt to Transfermakt site upstream changes (#35)

dcaribou · Nov 14, 2021 · b6442cc · b6442cc
1 parent a7ef0ed
commit b6442cc
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 12 deletions.
diff --git a/tfmkt/spiders/appearances.py b/tfmkt/spiders/appearances.py
@@ -1,6 +1,5 @@
 from tfmkt.spiders.common import BaseSpider
 from scrapy.shell import inspect_response # required for debugging
-import re
 from inflection import parameterize, underscore
 from urllib.parse import urlparse
 
@@ -31,6 +30,9 @@ def parse_stats(self, response, parent):
     @scrapes assists competition_code date for goals href matchday minutes_played opponent parent pos red_cards result second_yellow_cards type venue yellow_cards
     """
 
+    # inspect_response(response, self)
+    # exit(1)
+
     def parse_stats_table(table):
         """Parses a table of player's statistics."""
         header_elements = [
@@ -56,34 +58,44 @@ def parse_stats_table(table):
 
     def parse_stats_elem(elem):
         """Parse an individual table cell"""
+
+        self.logger.debug("Prasing element: %s", elem.get())
+
         # some cells include the club classification in the national league in brackets. for example, "Leeds (10.)"
         # these are at the same time unncessary and annoying to parse, as club information can be obtained
         # from the "shield" image. identify these cells by looking for descendents of the class 'tabellenplatz'
         has_classification_in_brackets = elem.xpath('*[@class = "tabellenplatz"]').get() is not None
         # club information is parsed from team "shields" using a separate logic from the rest
         # identify cells containing club shields
         has_shield_class = elem.css('img::attr(src)').get() is not None
-        club_href = elem.css('a.vereinprofil_tooltip::attr(href)').get()
+        club_href = elem.xpath('tm-tooltip[@data-type="club"]/a/@href').get()
         result_href = elem.css('a.ergebnis-link::attr(href)').get()
-
+
+        self.logger.debug("Extracted values: has_shield_class: %s, club_href: %s, result_href: %s", has_shield_class, club_href, result_href)
+
         if (
             (has_classification_in_brackets and club_href is None) or
             (club_href is not None and not has_shield_class) 
             ):
+          self.logger.debug("Found club href without shield class, skipping")
           return None
         elif club_href is not None:
+          self.logger.debug("Found club href: %s", club_href)
           return {'type': 'club', 'href': club_href}
         elif result_href is not None:
+          self.logger.debug("Found result/game href: %s", result_href)
           return {'type': 'game', 'href': result_href}
         # finally, most columns can be parsed by extracting the text at the element's "last leaf"
         else:
-          return elem.xpath('string(.)').get().strip()
+          extracted_element = elem.xpath('string(.)').get().strip()
+          self.logger.debug("Extracted element: %s", extracted_element)
+          return extracted_element
 
     # stats tables are 'responsive-tables' (except the first one, which is
     # a summary table)
     competitions = response.css(
         'div.table-header > a::attr(name)'
-    ).getall()[1:]
+    ).getall()
     stats_tables = response.css('div.responsive-table')[1:]
     assert(len(competitions) == len(stats_tables))
     all_stats = {}

diff --git a/tfmkt/spiders/players.py b/tfmkt/spiders/players.py
@@ -1,7 +1,5 @@
 from tfmkt.spiders.common import BaseSpider
 from scrapy.shell import inspect_response # required for debugging
-import re
-from inflection import parameterize, underscore
 
 class PlayersSpider(BaseSpider):
   name = 'players'
@@ -14,13 +12,17 @@ def parse(self, response, parent):
         @cb_kwargs {"parent": "dummy"}
       """
 
-      player_hrefs = response.css(
-            'a.spielprofil_tooltip::attr(href)'
-        ).getall()
+      # inspect_response(response, self)
+      # exit(1)
 
-      without_duplicates = list(set(player_hrefs))
+      players_table = response.xpath("//div[@class='responsive-table']")
+      assert len(players_table) == 1
 
-      for href in without_duplicates:
+      players_table = players_table[0]
+
+      player_hrefs = players_table.xpath("//tm-tooltip[@data-type='player']/div[1]/span/a/@href").getall()
+
+      for href in player_hrefs:
 
           cb_kwargs = {
             'base' : {