diff --git a/members/wikipedia-mla.xml b/members/wikipedia-mla.xml
index c644bced..b09c166b 100644
--- a/members/wikipedia-mla.xml
+++ b/members/wikipedia-mla.xml
@@ -1,4 +1,4 @@
-
+
@@ -31,7 +31,7 @@
-
+
@@ -59,7 +59,7 @@
-
+
@@ -98,7 +98,7 @@
-
+
@@ -113,6 +113,7 @@
+
@@ -123,7 +124,7 @@
-
+
@@ -143,13 +144,16 @@
+
-
+
+
+
@@ -174,16 +178,21 @@
+
+
+
+
+
@@ -224,7 +233,45 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/members/wikipedia-msp.xml b/members/wikipedia-msp.xml
index 50f7ac15..6fb7c2b4 100644
--- a/members/wikipedia-msp.xml
+++ b/members/wikipedia-msp.xml
@@ -1,4 +1,4 @@
-
+
@@ -16,6 +16,7 @@
+
@@ -56,7 +57,7 @@
-
+
@@ -83,7 +84,7 @@
-
+
@@ -95,7 +96,7 @@
-
+
@@ -254,6 +255,8 @@
+
+
@@ -279,7 +282,7 @@
-
+
@@ -302,8 +305,47 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/pyscraper/ni/wikipedia-mla.py b/pyscraper/ni/wikipedia-mla.py
index dea36e57..d3eb309d 100755
--- a/pyscraper/ni/wikipedia-mla.py
+++ b/pyscraper/ni/wikipedia-mla.py
@@ -7,11 +7,13 @@
# certain conditions. However, it comes with ABSOLUTELY NO WARRANTY.
# For details see the file LICENSE.html in the top level of the source.
+import os
import sys
import urllib.parse
import re
-sys.path.extend((".", ".."))
+file_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), '..')
+sys.path.insert(0, file_dir)
from ni.resolvenames import memberList
wiki_index_url = "https://en.wikipedia.org/wiki/Members_of_the_4th_Northern_Ireland_Assembly"
@@ -19,24 +21,18 @@
# Grab pages
def read(y):
- with open('../rawdata/Members_of_the_NIA_%d' % y) as ur:
+ with open(file_dir + '/../rawdata/Members_of_the_NIA_%d' % y) as ur:
return ur.read()
-content = read(2003) + read(2007) + read(2011) + read(2016) + read(2017)
+content = read(2003) + read(2007) + read(2011) + read(2016) + read(2017) + read(2022)
matches = set()
# Links from all pages
-matcher = '
\s+]*>([^<]+)\s* | \s+]*>([^<]+)(?: \(Leader\))?\s* | '
+matcher = '
\s+]*>([^<]+)\s* | \s+]*>([^<]+)\s* | '
matches.update(re.findall(matcher, content))
-# 3rd Assembly replacements
-matcher = '
\s+]*>([^<]+) \((?:resigned|deceased)\), replaced by ]*>([^<]+)\s* | \s+([^<]+)\s* | '
-for m in re.findall(matcher, content):
- matches.add( (m[0], m[1], m[4]) )
- matches.add( (m[2], m[3], m[4]) )
-
# 4-6th Assembly
-matcher = '([^<]+)\s* | \s*()\s*\s*\s*
'
+matcher = '([^<]+)[^<]* | \s*()\s*\s*\s*'
matches.update(re.findall(matcher, content))
# 4-6th Assembly changes
diff --git a/pyscraper/sp/wikipedia-msp.py b/pyscraper/sp/wikipedia-msp.py
index 05cec801..a1e63808 100755
--- a/pyscraper/sp/wikipedia-msp.py
+++ b/pyscraper/sp/wikipedia-msp.py
@@ -14,7 +14,8 @@
import urllib.parse
import re
-sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), '..'))
+file_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), '..')
+sys.path.insert(0, file_dir)
from sp.resolvenames import memberList
date_today = datetime.date.today().isoformat()
@@ -26,6 +27,7 @@
"http://en.wikipedia.org/wiki/Members_of_the_3rd_Scottish_Parliament",
"http://en.wikipedia.org/wiki/Members_of_the_4th_Scottish_Parliament",
"http://en.wikipedia.org/wiki/Members_of_the_5th_Scottish_Parliament",
+ "http://en.wikipedia.org/wiki/Members_of_the_6th_Scottish_Parliament",
]
wikimembers = {}
@@ -33,17 +35,13 @@
for u in wiki_index_urls:
leaf = re.sub('.*/','',u)
- ur = open('../../rawdata/' + leaf)
+ ur = open(file_dir + '/../rawdata/' + leaf)
content += ur.read()
ur.close()
matcher = '(?ims)]*?title="[^"]+"[^>]*>([^<]+)'
matches = re.findall(matcher, content)
-matches.append(('/wiki/Dorothy_Grace_Elder','Dorothy-Grace Elder'))
-matches.append(('/wiki/Chris_Harvie', 'Christopher Harvie'))
-matches.append(('/wiki/Nicholas_Johnston', 'Nick Johnston'))
-
for (url, name) in matches:
id_list = None
try:
diff --git a/scripts/weeklyupdate b/scripts/weeklyupdate
index 54ee3d6e..9d21c288 100755
--- a/scripts/weeklyupdate
+++ b/scripts/weeklyupdate
@@ -24,16 +24,18 @@ curl -s "https://en.wikipedia.org/wiki/MPs_elected_in_the_UK_general_election,_2
curl -s "https://en.wikipedia.org/wiki/MPs_elected_in_the_UK_general_election,_2005" > Members_of_the_House_of_Commons_2005
curl -s "https://en.wikipedia.org/wiki/MPs_elected_in_the_UK_general_election,_2001" > Members_of_the_House_of_Commons_2001
curl -s "https://en.wikipedia.org/wiki/MPs_elected_in_the_UK_general_election,_1997" > Members_of_the_House_of_Commons_1997
-curl -s "https://en.wikipedia.org/wiki/Members_of_the_2nd_Northern_Ireland_Assembly" > Members_of_the_NIA_2003
-curl -s "https://en.wikipedia.org/wiki/Members_of_the_3rd_Northern_Ireland_Assembly" > Members_of_the_NIA_2007
-curl -s "https://en.wikipedia.org/wiki/Members_of_the_4th_Northern_Ireland_Assembly" > Members_of_the_NIA_2011
-curl -s "https://en.wikipedia.org/wiki/Members_of_the_5th_Northern_Ireland_Assembly" > Members_of_the_NIA_2016
-curl -s "https://en.wikipedia.org/wiki/Members_of_the_6th_Northern_Ireland_Assembly" > Members_of_the_NIA_2017
+curl -s "https://en.wikipedia.org/wiki/2nd_Northern_Ireland_Assembly" > Members_of_the_NIA_2003
+curl -s "https://en.wikipedia.org/wiki/3rd_Northern_Ireland_Assembly" > Members_of_the_NIA_2007
+curl -s "https://en.wikipedia.org/wiki/4th_Northern_Ireland_Assembly" > Members_of_the_NIA_2011
+curl -s "https://en.wikipedia.org/wiki/5th_Northern_Ireland_Assembly" > Members_of_the_NIA_2016
+curl -s "https://en.wikipedia.org/wiki/6th_Northern_Ireland_Assembly" > Members_of_the_NIA_2017
+curl -s 'https://en.wikipedia.org/wiki/7th_Northern_Ireland_Assembly' > Members_of_the_NIA_2022
curl -s "https://en.wikipedia.org/wiki/1st_Scottish_Parliament" > Members_of_the_1st_Scottish_Parliament
curl -s "https://en.wikipedia.org/wiki/2nd_Scottish_Parliament" > Members_of_the_2nd_Scottish_Parliament
curl -s "https://en.wikipedia.org/wiki/3rd_Scottish_Parliament" > Members_of_the_3rd_Scottish_Parliament
curl -s "https://en.wikipedia.org/wiki/4th_Scottish_Parliament" > Members_of_the_4th_Scottish_Parliament
curl -s "https://en.wikipedia.org/wiki/5th_Scottish_Parliament" > Members_of_the_5th_Scottish_Parliament
+curl -s "https://en.wikipedia.org/wiki/6th_Scottish_Parliament" > Members_of_the_6th_Scottish_Parliament
# curl -s "https://en.wikipedia.org/wiki/MPs_standing_down_in_the_United_Kingdom_general_election,_2010" > MPs_standing_down_in_2010
#svn -q commit -m "Weekly rawdata scrape commit"
cd ~/parlparse/members
@@ -43,19 +45,9 @@ cd ~/parlparse/members
# ./bbcconv.py > bbc-links.xml
cd ~/parlparse/pyscraper
ni/wikipedia-mla.py > ../members/wikipedia-mla.xml
-cd ~/parlparse/pyscraper/sp
-./wikipedia-msp.py > ../../members/wikipedia-msp.xml
+sp/wikipedia-msp.py > ../members/wikipedia-msp.xml
#cd ~/parlparse/pyscraper
#./get_links_from_ep.py
-#cd ~/parlparse/members
-#svn -q commit -m "Weekly members scrape commit"
-
-# Force reindex, to find URLs which have changed (updates to Hansard,
-# including the once-per-session grand URL-rename-and-break from cm to vo)
-# Run scraper for all of parliament
-# cd ~/parlparse/scripts
-# ./updatedaterange-scrape $WEEKLY_FROMDATE $LORDS_WEEKLY_FROMDATE $WEEKLY_TODATE --force-index
-# ./updatedaterange-parse $WEEKLY_FROMDATE $LORDS_WEEKLY_FROMDATE $WEEKLY_TODATE
# Zip up XML files for people
cd ~/parldata/scrapedxml