diff --git a/members/wikipedia-mla.xml b/members/wikipedia-mla.xml index c644bced..b09c166b 100644 --- a/members/wikipedia-mla.xml +++ b/members/wikipedia-mla.xml @@ -1,4 +1,4 @@ - + @@ -31,7 +31,7 @@ - + @@ -59,7 +59,7 @@ - + @@ -98,7 +98,7 @@ - + @@ -113,6 +113,7 @@ + @@ -123,7 +124,7 @@ - + @@ -143,13 +144,16 @@ + - + + + @@ -174,16 +178,21 @@ + + + + + @@ -224,7 +233,45 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/members/wikipedia-msp.xml b/members/wikipedia-msp.xml index 50f7ac15..6fb7c2b4 100644 --- a/members/wikipedia-msp.xml +++ b/members/wikipedia-msp.xml @@ -1,4 +1,4 @@ - + @@ -16,6 +16,7 @@ + @@ -56,7 +57,7 @@ - + @@ -83,7 +84,7 @@ - + @@ -95,7 +96,7 @@ - + @@ -254,6 +255,8 @@ + + @@ -279,7 +282,7 @@ - + @@ -302,8 +305,47 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/pyscraper/ni/wikipedia-mla.py b/pyscraper/ni/wikipedia-mla.py index dea36e57..d3eb309d 100755 --- a/pyscraper/ni/wikipedia-mla.py +++ b/pyscraper/ni/wikipedia-mla.py @@ -7,11 +7,13 @@ # certain conditions. However, it comes with ABSOLUTELY NO WARRANTY. # For details see the file LICENSE.html in the top level of the source. +import os import sys import urllib.parse import re -sys.path.extend((".", "..")) +file_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), '..') +sys.path.insert(0, file_dir) from ni.resolvenames import memberList wiki_index_url = "https://en.wikipedia.org/wiki/Members_of_the_4th_Northern_Ireland_Assembly" @@ -19,24 +21,18 @@ # Grab pages def read(y): - with open('../rawdata/Members_of_the_NIA_%d' % y) as ur: + with open(file_dir + '/../rawdata/Members_of_the_NIA_%d' % y) as ur: return ur.read() -content = read(2003) + read(2007) + read(2011) + read(2016) + read(2017) +content = read(2003) + read(2007) + read(2011) + read(2016) + read(2017) + read(2022) matches = set() # Links from all pages -matcher = '\s+]*>([^<]+)\s*\s+]*>([^<]+)(?: \(Leader\))?\s*' +matcher = '\s+]*>([^<]+)\s*\s+]*>([^<]+)\s*' matches.update(re.findall(matcher, content)) -# 3rd Assembly replacements -matcher = '\s+]*>([^<]+) \((?:resigned|deceased)\), replaced by ]*>([^<]+)\s*\s+([^<]+)\s*' -for m in re.findall(matcher, content): - matches.add( (m[0], m[1], m[4]) ) - matches.add( (m[2], m[3], m[4]) ) - # 4-6th Assembly -matcher = '([^<]+)\s*\s*()\s*\s*\s*' +matcher = '([^<]+)[^<]*\s*()\s*\s*\s*' matches.update(re.findall(matcher, content)) # 4-6th Assembly changes diff --git a/pyscraper/sp/wikipedia-msp.py b/pyscraper/sp/wikipedia-msp.py index 05cec801..a1e63808 100755 --- a/pyscraper/sp/wikipedia-msp.py +++ b/pyscraper/sp/wikipedia-msp.py @@ -14,7 +14,8 @@ import urllib.parse import re -sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), '..')) +file_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), '..') +sys.path.insert(0, file_dir) from sp.resolvenames import memberList date_today = datetime.date.today().isoformat() @@ -26,6 +27,7 @@ "http://en.wikipedia.org/wiki/Members_of_the_3rd_Scottish_Parliament", "http://en.wikipedia.org/wiki/Members_of_the_4th_Scottish_Parliament", "http://en.wikipedia.org/wiki/Members_of_the_5th_Scottish_Parliament", + "http://en.wikipedia.org/wiki/Members_of_the_6th_Scottish_Parliament", ] wikimembers = {} @@ -33,17 +35,13 @@ for u in wiki_index_urls: leaf = re.sub('.*/','',u) - ur = open('../../rawdata/' + leaf) + ur = open(file_dir + '/../rawdata/' + leaf) content += ur.read() ur.close() matcher = '(?ims)]*?title="[^"]+"[^>]*>([^<]+)' matches = re.findall(matcher, content) -matches.append(('/wiki/Dorothy_Grace_Elder','Dorothy-Grace Elder')) -matches.append(('/wiki/Chris_Harvie', 'Christopher Harvie')) -matches.append(('/wiki/Nicholas_Johnston', 'Nick Johnston')) - for (url, name) in matches: id_list = None try: diff --git a/scripts/weeklyupdate b/scripts/weeklyupdate index 54ee3d6e..9d21c288 100755 --- a/scripts/weeklyupdate +++ b/scripts/weeklyupdate @@ -24,16 +24,18 @@ curl -s "https://en.wikipedia.org/wiki/MPs_elected_in_the_UK_general_election,_2 curl -s "https://en.wikipedia.org/wiki/MPs_elected_in_the_UK_general_election,_2005" > Members_of_the_House_of_Commons_2005 curl -s "https://en.wikipedia.org/wiki/MPs_elected_in_the_UK_general_election,_2001" > Members_of_the_House_of_Commons_2001 curl -s "https://en.wikipedia.org/wiki/MPs_elected_in_the_UK_general_election,_1997" > Members_of_the_House_of_Commons_1997 -curl -s "https://en.wikipedia.org/wiki/Members_of_the_2nd_Northern_Ireland_Assembly" > Members_of_the_NIA_2003 -curl -s "https://en.wikipedia.org/wiki/Members_of_the_3rd_Northern_Ireland_Assembly" > Members_of_the_NIA_2007 -curl -s "https://en.wikipedia.org/wiki/Members_of_the_4th_Northern_Ireland_Assembly" > Members_of_the_NIA_2011 -curl -s "https://en.wikipedia.org/wiki/Members_of_the_5th_Northern_Ireland_Assembly" > Members_of_the_NIA_2016 -curl -s "https://en.wikipedia.org/wiki/Members_of_the_6th_Northern_Ireland_Assembly" > Members_of_the_NIA_2017 +curl -s "https://en.wikipedia.org/wiki/2nd_Northern_Ireland_Assembly" > Members_of_the_NIA_2003 +curl -s "https://en.wikipedia.org/wiki/3rd_Northern_Ireland_Assembly" > Members_of_the_NIA_2007 +curl -s "https://en.wikipedia.org/wiki/4th_Northern_Ireland_Assembly" > Members_of_the_NIA_2011 +curl -s "https://en.wikipedia.org/wiki/5th_Northern_Ireland_Assembly" > Members_of_the_NIA_2016 +curl -s "https://en.wikipedia.org/wiki/6th_Northern_Ireland_Assembly" > Members_of_the_NIA_2017 +curl -s 'https://en.wikipedia.org/wiki/7th_Northern_Ireland_Assembly' > Members_of_the_NIA_2022 curl -s "https://en.wikipedia.org/wiki/1st_Scottish_Parliament" > Members_of_the_1st_Scottish_Parliament curl -s "https://en.wikipedia.org/wiki/2nd_Scottish_Parliament" > Members_of_the_2nd_Scottish_Parliament curl -s "https://en.wikipedia.org/wiki/3rd_Scottish_Parliament" > Members_of_the_3rd_Scottish_Parliament curl -s "https://en.wikipedia.org/wiki/4th_Scottish_Parliament" > Members_of_the_4th_Scottish_Parliament curl -s "https://en.wikipedia.org/wiki/5th_Scottish_Parliament" > Members_of_the_5th_Scottish_Parliament +curl -s "https://en.wikipedia.org/wiki/6th_Scottish_Parliament" > Members_of_the_6th_Scottish_Parliament # curl -s "https://en.wikipedia.org/wiki/MPs_standing_down_in_the_United_Kingdom_general_election,_2010" > MPs_standing_down_in_2010 #svn -q commit -m "Weekly rawdata scrape commit" cd ~/parlparse/members @@ -43,19 +45,9 @@ cd ~/parlparse/members # ./bbcconv.py > bbc-links.xml cd ~/parlparse/pyscraper ni/wikipedia-mla.py > ../members/wikipedia-mla.xml -cd ~/parlparse/pyscraper/sp -./wikipedia-msp.py > ../../members/wikipedia-msp.xml +sp/wikipedia-msp.py > ../members/wikipedia-msp.xml #cd ~/parlparse/pyscraper #./get_links_from_ep.py -#cd ~/parlparse/members -#svn -q commit -m "Weekly members scrape commit" - -# Force reindex, to find URLs which have changed (updates to Hansard, -# including the once-per-session grand URL-rename-and-break from cm to vo) -# Run scraper for all of parliament -# cd ~/parlparse/scripts -# ./updatedaterange-scrape $WEEKLY_FROMDATE $LORDS_WEEKLY_FROMDATE $WEEKLY_TODATE --force-index -# ./updatedaterange-parse $WEEKLY_FROMDATE $LORDS_WEEKLY_FROMDATE $WEEKLY_TODATE # Zip up XML files for people cd ~/parldata/scrapedxml