Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
YanLiang1102 committed Jun 29, 2017
1 parent 78cbe0d commit 9bd7b8f
Showing 1 changed file with 349 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,353 @@
{
"cells": [],
"metadata": {},
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#https://en.wikipedia.org/wiki\n",
"import datetime\n",
"import json\n",
"import pickle\n",
"import re\n",
"import traceback\n",
"\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def name_formatter(name):\n",
"    \"\"\"Return `name` with underscores turned into spaces, trimmed and title-cased.\"\"\"\n",
"    spaced = re.sub(\"_\", \" \", name)\n",
"    trimmed = spaced.strip()\n",
"    return trimmed.title()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Mohammad Najibullah'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sanity check: underscores become spaces and each word is title-cased.\n",
"name_formatter(\"Mohammad_Najibullah\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def ar_lookup(name, timeout=10):\n",
"    \"\"\"\n",
"    Convert names from English to Arabic using European Media Monitors and\n",
"    code modified from Phil Schrodt.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    name : str\n",
"        English name; it is normalised with name_formatter before the search.\n",
"    timeout : float, optional\n",
"        Seconds to wait on each HTTP request before giving up.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of str\n",
"        Entries of the entity page's name list whose parenthesised tag matches\n",
"        \"ar\", with that tag stripped. An empty list is returned when the search\n",
"        has no usable hit or when fetching/parsing either page fails.\n",
"    \"\"\"\n",
"    name = name_formatter(name)\n",
"    base_url = \"http://emm.newsexplorer.eu/NewsExplorer/search/en/entities?query=\"\n",
"    url = base_url + name\n",
"\n",
"    try:\n",
"        # A timeout keeps a stalled connection from hanging the kernel.\n",
"        page = requests.get(url, timeout=timeout)\n",
"        soup = BeautifulSoup(page.content, \"lxml\")\n",
"        name_url = soup.find(\"p\", {\"class\" : \"center_headline\"}).find(\"a\")['href']\n",
"    except Exception:\n",
"        # Best-effort lookup: no search hit or a network failure means no names.\n",
"        return []\n",
"\n",
"    try:\n",
"        base = \"http://emm.newsexplorer.eu/NewsExplorer/search/en/\"\n",
"        name_url = base + name_url\n",
"        name_page = requests.get(name_url, timeout=timeout)\n",
"        soup = BeautifulSoup(name_page.content, \"lxml\")\n",
"        # check to make sure in list???? Take in alt names???/petrarch2/petrarch2/data/dictionaries\n",
"        names = soup.find(\"td\", {\"colspan\" : \"1\"}).find_all(\"p\")\n",
"        # The first <p> is skipped; the remaining ones are the candidate names.\n",
"        names = [i.text for i in names][1:]\n",
"        # Raw strings so the regex backslashes are not read as string escapes.\n",
"        names_ar = [i for i in names if re.search(r\"\\(.*?ar\", i)]\n",
"        names_ar = [re.sub(r\"\\s+?\\(.+?\\)\", \"\", entry) for entry in names_ar]\n",
"        return names_ar\n",
"    except Exception:\n",
"        traceback.print_exc()\n",
"        return []"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def hack_wiki(eng_name, timeout=10):\n",
"    \"\"\"\n",
"    Look up the Arabic title of an English Wikipedia article.\n",
"\n",
"    Fetches https://en.wikipedia.org/wiki/<eng_name>, follows the link inside\n",
"    the ``li.interwiki-ar`` element, and reads the element with\n",
"    ``id=\"firstHeading\"`` on the linked page.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    eng_name : str\n",
"        Article title as it appears in the URL path, e.g. \"Mohammad_Najibullah\".\n",
"    timeout : float, optional\n",
"        Seconds to wait on each HTTP request before giving up.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str or None\n",
"        Text of the linked page's heading, or None when the article has no\n",
"        ``interwiki-ar`` link, the heading is missing, or a request/parse\n",
"        error occurs (the error is printed).\n",
"    \"\"\"\n",
"    base_url = \"https://en.wikipedia.org/wiki/\" + eng_name\n",
"    try:\n",
"        page = requests.get(base_url, timeout=timeout)\n",
"        soup = BeautifulSoup(page.content, \"lxml\")\n",
"        ar_item = soup.find(\"li\", {\"class\": \"interwiki-ar\"})\n",
"        ar_anchor = ar_item.find(\"a\") if ar_item is not None else None\n",
"        if ar_anchor is None:\n",
"            # No interwiki-ar link on this article.\n",
"            return None\n",
"        ar_url = ar_anchor['href']\n",
"        if ar_url.startswith(\"//\"):\n",
"            # Protocol-relative link: requests needs an explicit scheme.\n",
"            ar_url = \"https:\" + ar_url\n",
"        ar_page = requests.get(ar_url, timeout=timeout)\n",
"        ar_soup = BeautifulSoup(ar_page.content, \"lxml\")\n",
"        ar_heading = ar_soup.find(id=\"firstHeading\")\n",
"        if ar_heading is None:\n",
"            return None\n",
"        return ar_heading.get_text().strip()\n",
"    except Exception as e:\n",
"        print(e)\n",
"        return None"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['محمد نجيب الله']\n"
]
}
],
"source": [
"# Try the Wikipedia lookup on a known English article title.\n",
"hack_wiki(\"Mohammad_Najibullah\")"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"# Select the links that sit inside the header cells of the article's infobox.\n",
"base_url=\"https://en.wikipedia.org/wiki/\"+\"Mohammad_Najibullah\"\n",
"# A timeout keeps a stalled connection from hanging the kernel indefinitely.\n",
"page=requests.get(base_url,timeout=10)\n",
"soup=BeautifulSoup(page.content,\"lxml\")\n",
"lists=soup.select(\".infobox tr th a\")"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['President of Afghanistan']\n",
"[\"General Secretary of the Central Committee of the People's Democratic Party of Afghanistan\"]\n",
"['Director of the State Intelligence Agency']\n",
"['Alma mater']\n"
]
}
],
"source": [
"# Show the child nodes of every element currently held in `lists`.\n",
"for item in lists:\n",
" print(item.contents)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"30 September 1987– 16 April 1992\n",
"4 May 1986– 16 April 1992\n",
"11 January 1980– 21 November 1985\n"
]
}
],
"source": [
"# Print the text that follows each visible \"nowrap\" label in the infobox's data cells.\n",
"base_url=\"https://en.wikipedia.org/wiki/\"+\"Mohammad_Najibullah\"\n",
"# A timeout keeps a stalled connection from hanging the kernel indefinitely.\n",
"page=requests.get(base_url,timeout=10)\n",
"soup=BeautifulSoup(page.content,\"lxml\")\n",
"#lists=soup.select(\".infobox tr th a\")\n",
"infobox=soup.find(\"table\",{\"class\":\"infobox\"})\n",
"# `lists` holds every infobox row; an empty list stands in when the page has no infobox.\n",
"lists=infobox.find_all(\"tr\") if infobox is not None else []\n",
"for item in lists:\n",
"    cell=item.find(\"td\")\n",
"    label=cell.find(\"span\",{\"class\":\"nowrap\"}) if cell is not None else None\n",
"    if label is None or label.parent is None:\n",
"        continue\n",
"    siblings=label.parent.contents\n",
"    # Skip cells carrying hidden markup (their contents include a display:none element).\n",
"    if \"display:none\" in str(siblings):\n",
"        continue\n",
"    # Join the cell's direct text nodes instead of slicing the list repr, which\n",
"    # raised IndexError whenever the repr contained no newline.\n",
"    text=\"\".join(node for node in siblings if isinstance(node,str))\n",
"    # Collapse whitespace (including non-breaking spaces) to single spaces.\n",
"    text=\" \".join(text.split())\n",
"    if text:\n",
"        print(text)\n",
"\n",
"#$('.infobox tr th a').closest(\"tr\").next(\"tr\").find(\".nowrap\").closest(\"td\").each(function(){console.log($(this).text())})"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<tr>\n",
" <th colspan=\"2\" style=\"text-align:center;font-size:125%;font-weight:bold;font-size: 130%;\"><span class=\"fn\">Dr Najibullah Ahmadzai</span></th>\n",
" </tr>, <tr>\n",
" <td colspan=\"2\" style=\"text-align:center\"><a class=\"image\" href=\"/wiki/File:Najib.jpg\"><img alt=\"Najib.jpg\" data-file-height=\"351\" data-file-width=\"283\" height=\"273\" src=\"//upload.wikimedia.org/wikipedia/en/thumb/4/4c/Najib.jpg/220px-Najib.jpg\" srcset=\"//upload.wikimedia.org/wikipedia/en/4/4c/Najib.jpg 1.5x\" width=\"220\"/></a></td>\n",
" </tr>, <tr>\n",
" <td colspan=\"2\" style=\"text-align:center\"></td>\n",
" </tr>, <tr>\n",
" <th colspan=\"2\" style=\"text-align:center;background:lavender\"><a href=\"/wiki/President_of_Afghanistan\" title=\"President of Afghanistan\">President of Afghanistan</a></th>\n",
" </tr>, <tr>\n",
" <td colspan=\"2\" style=\"text-align:center;border-bottom:none\"><span class=\"nowrap\"><b>In office</b></span><br/>\n",
" 30 September 1987 – 16 April 1992</td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Prime Minister</span></th>\n",
" <td><a href=\"/wiki/Sultan_Ali_Keshtmand\" title=\"Sultan Ali Keshtmand\">Sultan Ali Keshtmand</a><br/>\n",
" <a href=\"/wiki/Mohammad_Hasan_Sharq\" title=\"Mohammad Hasan Sharq\">Mohammad Hasan Sharq</a><br/>\n",
" <a href=\"/wiki/Sultan_Ali_Keshtmand\" title=\"Sultan Ali Keshtmand\">Sultan Ali Keshtmand</a><br/>\n",
" <a href=\"/wiki/Fazal_Haq_Khaliqyar\" title=\"Fazal Haq Khaliqyar\">Fazal Haq Khaliqyar</a></td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Preceded by</span></th>\n",
" <td><a href=\"/wiki/Haji_Mohammad_Chamkani\" title=\"Haji Mohammad Chamkani\">Haji Mohammad Chamkani</a></td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Succeeded by</span></th>\n",
" <td><a href=\"/wiki/Abdul_Rahim_Hatif\" title=\"Abdul Rahim Hatif\">Abdul Rahim Hatif</a> <small>(acting)</small></td>\n",
" </tr>, <tr>\n",
" <th colspan=\"2\" style=\"text-align:center;background:lavender\"><a href=\"/wiki/People%27s_Democratic_Party_of_Afghanistan\" title=\"People's Democratic Party of Afghanistan\">General Secretary of the Central Committee of the People's Democratic Party of Afghanistan</a></th>\n",
" </tr>, <tr>\n",
" <td colspan=\"2\" style=\"text-align:center;border-bottom:none\"><span class=\"nowrap\"><b>In office</b></span><br/>\n",
" 4 May 1986 – 16 April 1992</td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Preceded by</span></th>\n",
" <td><a href=\"/wiki/Babrak_Karmal\" title=\"Babrak Karmal\">Babrak Karmal</a></td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Succeeded by</span></th>\n",
" <td>Position abolished</td>\n",
" </tr>, <tr>\n",
" <th colspan=\"2\" style=\"text-align:center;background:lavender\"><a href=\"/wiki/KHAD\" title=\"KHAD\">Director of the State Intelligence Agency</a></th>\n",
" </tr>, <tr>\n",
" <td colspan=\"2\" style=\"text-align:center;border-bottom:none\"><span class=\"nowrap\"><b>In office</b></span><br/>\n",
" 11 January 1980 – 21 November 1985</td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\">President</th>\n",
" <td><a href=\"/wiki/Babrak_Karmal\" title=\"Babrak Karmal\">Babrak Karmal</a></td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Prime Minister</span></th>\n",
" <td><a href=\"/wiki/Babrak_Karmal\" title=\"Babrak Karmal\">Babrak Karmal</a><br/>\n",
" <a href=\"/wiki/Sultan_Ali_Keshtmand\" title=\"Sultan Ali Keshtmand\">Sultan Ali Keshtmand</a></td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Preceded by</span></th>\n",
" <td><a href=\"/wiki/Assadullah_Sarwari\" title=\"Assadullah Sarwari\">Assadullah Sarwari</a></td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Succeeded by</span></th>\n",
" <td><a href=\"/wiki/Ghulam_Faruq_Yaqubi\" title=\"Ghulam Faruq Yaqubi\">Ghulam Faruq Yaqubi</a></td>\n",
" </tr>, <tr style=\"display:none\">\n",
" <td colspan=\"2\"></td>\n",
" </tr>, <tr>\n",
" <th colspan=\"2\" style=\"text-align:center;background:lavender\">Personal details</th>\n",
" </tr>, <tr>\n",
" <th scope=\"row\">Born</th>\n",
" <td>February 1947<br/>\n",
" <a class=\"mw-redirect\" href=\"/wiki/Paktia\" title=\"Paktia\">Paktia</a>, <a href=\"/wiki/Kingdom_of_Afghanistan\" title=\"Kingdom of Afghanistan\">Afghanistan</a></td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\">Died</th>\n",
" <td><span class=\"nowrap\">28 September 1996<span style=\"display:none\">(<span class=\"dday deathdate\">1996-09-28</span>)</span> (aged 49)</span><br/>\n",
" <a href=\"/wiki/Kabul\" title=\"Kabul\">Kabul</a>, <a href=\"/wiki/Islamic_Emirate_of_Afghanistan\" title=\"Islamic Emirate of Afghanistan\">Afghanistan</a></td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\">Political party</th>\n",
" <td><a href=\"/wiki/People%27s_Democratic_Party_of_Afghanistan\" title=\"People's Democratic Party of Afghanistan\">People's Democratic Party of Afghanistan</a><br/>\n",
" (<a href=\"/wiki/Parcham\" title=\"Parcham\">Parcham</a>)</td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\">Spouse(s)</th>\n",
" <td>Dr. Fatana Najib</td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\">Children</th>\n",
" <td>three daughters</td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\"><a href=\"/wiki/Alma_mater\" title=\"Alma mater\">Alma mater</a></th>\n",
" <td><a href=\"/wiki/Kabul_University\" title=\"Kabul University\">Kabul University</a></td>\n",
" </tr>]"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display `lists` as the cell result for inspection.\n",
"lists"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"None\n"
]
}
],
"source": [
"# NOTE(review): leftover debug print; the saved output (None) has a lower execution count than the cells that assign `lists` - confirm it is stale and consider removing this cell.\n",
"print(lists)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 9bd7b8f

Please sign in to comment.