-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
78cbe0d
commit 9bd7b8f
Showing
1 changed file
with
349 additions
and
2 deletions.
There are no files selected for viewing
351 changes: 349 additions & 2 deletions
351
otherHelperCode/english_to_arabic_dictionary/.ipynb_checkpoints/hack_wiki-checkpoint.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,353 @@ | ||
{ | ||
"cells": [], | ||
"metadata": {}, | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"#https://en.wikipedia.org/wiki\n", | ||
"import requests\n", | ||
"from bs4 import BeautifulSoup\n", | ||
"import re\n", | ||
"import json\n", | ||
"import pickle\n", | ||
"import datetime\n", | ||
" " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"def name_formatter(name):\n", | ||
"    \"\"\"Turn an underscore-separated slug into a display name.\n", | ||
"\n", | ||
"    Underscores are replaced by spaces, leading/trailing whitespace is\n", | ||
"    stripped, and the result is title-cased.\n", | ||
"    \"\"\"\n", | ||
"    spaced = re.sub(\"_\", \" \", name)\n", | ||
"    return spaced.strip().title()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'Mohammad Najibullah'" | ||
] | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Example: underscores become spaces and each word is title-cased.\n", | ||
"name_formatter(\"Mohammad_Najibullah\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"def ar_lookup(name):\n", | ||
"    \"\"\"\n", | ||
"    Convert names from English to Arabic using European Media Monitors and\n", | ||
"    code modified from Phil Schrodt.\n", | ||
"\n", | ||
"    The name is normalised with name_formatter and appended to the\n", | ||
"    NewsExplorer entity-search URL. The first link found under a\n", | ||
"    `p.center_headline` element is followed, and the `<p>` entries of the\n", | ||
"    first `td` with colspan=1 on that page are read (the first entry is\n", | ||
"    skipped). Entries whose parenthesised tag matches `ar` are returned with\n", | ||
"    the parenthetical removed.\n", | ||
"\n", | ||
"    Returns a list of strings; an empty list when either page cannot be\n", | ||
"    fetched or does not contain the expected elements.\n", | ||
"    \"\"\"\n", | ||
"    # Imported locally because the notebook's import cell does not provide it;\n", | ||
"    # without it the error handler below raised NameError instead of logging.\n", | ||
"    import traceback\n", | ||
"\n", | ||
"    name = name_formatter(name)\n", | ||
"    base_url = \"http://emm.newsexplorer.eu/NewsExplorer/search/en/entities?query=\"\n", | ||
"    url = base_url + name\n", | ||
"\n", | ||
"    try:\n", | ||
"        page = requests.get(url)\n", | ||
"        soup = BeautifulSoup(page.content, \"lxml\")\n", | ||
"        name_url = soup.find(\"p\", {\"class\" : \"center_headline\"}).find(\"a\")['href']\n", | ||
"    except Exception:\n", | ||
"        # Best effort: a failed request or a page without a result link\n", | ||
"        # simply means there is nothing to return.\n", | ||
"        return []\n", | ||
"\n", | ||
"    try:\n", | ||
"        base = \"http://emm.newsexplorer.eu/NewsExplorer/search/en/\"\n", | ||
"        name_url = base + name_url\n", | ||
"        name_page = requests.get(name_url)\n", | ||
"        soup = BeautifulSoup(name_page.content, \"lxml\")\n", | ||
"        # check to make sure in list???? Take in alt names???/petrarch2/petrarch2/data/dictionaries\n", | ||
"        entries = soup.find(\"td\", {\"colspan\" : \"1\"}).find_all(\"p\")\n", | ||
"        entries = [entry.text for entry in entries][1:]\n", | ||
"        # Raw strings keep the regex escapes (\\( and \\s) from being read as\n", | ||
"        # invalid string escapes.\n", | ||
"        names_ar = [entry for entry in entries if re.search(r\"\\(.*?ar\", entry)]\n", | ||
"        names_ar = [re.sub(r\"\\s+?\\(.+?\\)\", \"\", entry) for entry in names_ar]\n", | ||
"        return names_ar\n", | ||
"    except Exception:\n", | ||
"        traceback.print_exc()\n", | ||
"        return []" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 37, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"#this will need the user to pass in an english name and come back with an arabic name\n", | ||
"def hack_wiki(eng_name):\n", | ||
"    \"\"\"Look up the Arabic Wikipedia heading for an English Wikipedia article.\n", | ||
"\n", | ||
"    eng_name is appended to the English Wikipedia article URL as-is, e.g.\n", | ||
"    \"Mohammad_Najibullah\". The page's `li.interwiki-ar` link is followed and\n", | ||
"    the `#firstHeading` element of the linked page is read.\n", | ||
"\n", | ||
"    The heading's child nodes are printed, as before. The heading text is\n", | ||
"    also returned so callers can use it. If a request fails or an expected\n", | ||
"    element is missing, the error is printed and None is returned.\n", | ||
"    \"\"\"\n", | ||
"    base_url = \"https://en.wikipedia.org/wiki/\" + eng_name\n", | ||
"    try:\n", | ||
"        page = requests.get(base_url)\n", | ||
"        soup = BeautifulSoup(page.content, \"lxml\")\n", | ||
"        ar_url = soup.find(\"li\", {\"class\": \"interwiki-ar\"}).find(\"a\")['href']\n", | ||
"        ar_page = requests.get(ar_url)\n", | ||
"        ar_soup = BeautifulSoup(ar_page.content, \"lxml\")\n", | ||
"        ar_heading = ar_soup.find(id=\"firstHeading\")\n", | ||
"        print(ar_heading.contents)\n", | ||
"        # get_text() flattens any nested markup in the heading into one string.\n", | ||
"        return ar_heading.get_text().strip()\n", | ||
"    except Exception as e:\n", | ||
"        print(e)\n", | ||
"        return None" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 38, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"['محمد نجيب الله']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Example call; the saved output shows the Arabic page heading that hack_wiki prints.\n", | ||
"hack_wiki(\"Mohammad_Najibullah\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 118, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Fetch the English article and parse it with the lxml parser.\n", | ||
"base_url=\"https://en.wikipedia.org/wiki/\"+\"Mohammad_Najibullah\"\n", | ||
"page=requests.get(base_url)\n", | ||
"soup=BeautifulSoup(page.content,\"lxml\")\n", | ||
"# Select the links that sit inside header cells (th) of the infobox rows.\n", | ||
"# NOTE(review): `lists` is a notebook-global that another cell reassigns to a\n", | ||
"# different selection, so its value depends on which cell ran last.\n", | ||
"lists=soup.select(\".infobox tr th a\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 119, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"['President of Afghanistan']\n", | ||
"[\"General Secretary of the Central Committee of the People's Democratic Party of Afghanistan\"]\n", | ||
"['Director of the State Intelligence Agency']\n", | ||
"['Alma mater']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Print the child nodes (.contents) of every element currently held in `lists`.\n", | ||
"for item in lists:\n", | ||
" print(item.contents)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 117, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"30 September 1987– 16 April 1992\n", | ||
"4 May 1986– 16 April 1992\n", | ||
"11 January 1980– 21 November 1985\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"base_url=\"https://en.wikipedia.org/wiki/\"+\"Mohammad_Najibullah\"\n", | ||
"page=requests.get(base_url)\n", | ||
"soup=BeautifulSoup(page.content,\"lxml\")\n", | ||
"#lists=soup.select(\".infobox tr th a\")\n", | ||
"lists=soup.find(\"table\",{\"class\":\"infobox\"}).find_all(\"tr\")\n", | ||
"# For every infobox row whose data cell contains a nowrap span, print the bare\n", | ||
"# text that sits next to that span (in the saved output: the date ranges).\n", | ||
"for item in lists:\n", | ||
"    cell = item.find(\"td\")\n", | ||
"    if cell is None:\n", | ||
"        continue\n", | ||
"    label = cell.find(\"span\", {\"class\": \"nowrap\"})\n", | ||
"    if label is None:\n", | ||
"        continue\n", | ||
"    siblings = label.parent.contents\n", | ||
"    # Skip cells that carry hidden (display:none) markup next to the span.\n", | ||
"    if \"display:none\" in str(siblings):\n", | ||
"        continue\n", | ||
"    # Read the text nodes directly instead of slicing the list's repr, which\n", | ||
"    # raised IndexError whenever the cell contained no newline.\n", | ||
"    text_nodes = [node for node in siblings if isinstance(node, str)]\n", | ||
"    text = \"\".join(text_nodes).strip().replace(\"\\xa0\", \"\")\n", | ||
"    if text:\n", | ||
"        print(text)\n", | ||
"\n", | ||
"#$('.infobox tr th a').closest(\"tr\").next(\"tr\").find(\".nowrap\").closest(\"td\").each(function(){console.log($(this).text())})" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 83, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"[<tr>\n", | ||
" <th colspan=\"2\" style=\"text-align:center;font-size:125%;font-weight:bold;font-size: 130%;\"><span class=\"fn\">Dr Najibullah Ahmadzai</span></th>\n", | ||
" </tr>, <tr>\n", | ||
" <td colspan=\"2\" style=\"text-align:center\"><a class=\"image\" href=\"/wiki/File:Najib.jpg\"><img alt=\"Najib.jpg\" data-file-height=\"351\" data-file-width=\"283\" height=\"273\" src=\"//upload.wikimedia.org/wikipedia/en/thumb/4/4c/Najib.jpg/220px-Najib.jpg\" srcset=\"//upload.wikimedia.org/wikipedia/en/4/4c/Najib.jpg 1.5x\" width=\"220\"/></a></td>\n", | ||
" </tr>, <tr>\n", | ||
" <td colspan=\"2\" style=\"text-align:center\"></td>\n", | ||
" </tr>, <tr>\n", | ||
" <th colspan=\"2\" style=\"text-align:center;background:lavender\"><a href=\"/wiki/President_of_Afghanistan\" title=\"President of Afghanistan\">President of Afghanistan</a></th>\n", | ||
" </tr>, <tr>\n", | ||
" <td colspan=\"2\" style=\"text-align:center;border-bottom:none\"><span class=\"nowrap\"><b>In office</b></span><br/>\n", | ||
" 30 September 1987 – 16 April 1992</td>\n", | ||
" </tr>, <tr>\n", | ||
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Prime Minister</span></th>\n", | ||
" <td><a href=\"/wiki/Sultan_Ali_Keshtmand\" title=\"Sultan Ali Keshtmand\">Sultan Ali Keshtmand</a><br/>\n", | ||
" <a href=\"/wiki/Mohammad_Hasan_Sharq\" title=\"Mohammad Hasan Sharq\">Mohammad Hasan Sharq</a><br/>\n", | ||
" <a href=\"/wiki/Sultan_Ali_Keshtmand\" title=\"Sultan Ali Keshtmand\">Sultan Ali Keshtmand</a><br/>\n", | ||
" <a href=\"/wiki/Fazal_Haq_Khaliqyar\" title=\"Fazal Haq Khaliqyar\">Fazal Haq Khaliqyar</a></td>\n", | ||
" </tr>, <tr>\n", | ||
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Preceded by</span></th>\n", | ||
" <td><a href=\"/wiki/Haji_Mohammad_Chamkani\" title=\"Haji Mohammad Chamkani\">Haji Mohammad Chamkani</a></td>\n", | ||
" </tr>, <tr>\n", | ||
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Succeeded by</span></th>\n", | ||
" <td><a href=\"/wiki/Abdul_Rahim_Hatif\" title=\"Abdul Rahim Hatif\">Abdul Rahim Hatif</a> <small>(acting)</small></td>\n", | ||
" </tr>, <tr>\n", | ||
" <th colspan=\"2\" style=\"text-align:center;background:lavender\"><a href=\"/wiki/People%27s_Democratic_Party_of_Afghanistan\" title=\"People's Democratic Party of Afghanistan\">General Secretary of the Central Committee of the People's Democratic Party of Afghanistan</a></th>\n", | ||
" </tr>, <tr>\n", | ||
" <td colspan=\"2\" style=\"text-align:center;border-bottom:none\"><span class=\"nowrap\"><b>In office</b></span><br/>\n", | ||
" 4 May 1986 – 16 April 1992</td>\n", | ||
" </tr>, <tr>\n", | ||
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Preceded by</span></th>\n", | ||
" <td><a href=\"/wiki/Babrak_Karmal\" title=\"Babrak Karmal\">Babrak Karmal</a></td>\n", | ||
" </tr>, <tr>\n", | ||
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Succeeded by</span></th>\n", | ||
" <td>Position abolished</td>\n", | ||
" </tr>, <tr>\n", | ||
" <th colspan=\"2\" style=\"text-align:center;background:lavender\"><a href=\"/wiki/KHAD\" title=\"KHAD\">Director of the State Intelligence Agency</a></th>\n", | ||
" </tr>, <tr>\n", | ||
" <td colspan=\"2\" style=\"text-align:center;border-bottom:none\"><span class=\"nowrap\"><b>In office</b></span><br/>\n", | ||
" 11 January 1980 – 21 November 1985</td>\n", | ||
" </tr>, <tr>\n", | ||
" <th scope=\"row\" style=\"text-align:left\">President</th>\n", | ||
" <td><a href=\"/wiki/Babrak_Karmal\" title=\"Babrak Karmal\">Babrak Karmal</a></td>\n", | ||
" </tr>, <tr>\n", | ||
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Prime Minister</span></th>\n", | ||
" <td><a href=\"/wiki/Babrak_Karmal\" title=\"Babrak Karmal\">Babrak Karmal</a><br/>\n", | ||
" <a href=\"/wiki/Sultan_Ali_Keshtmand\" title=\"Sultan Ali Keshtmand\">Sultan Ali Keshtmand</a></td>\n", | ||
" </tr>, <tr>\n", | ||
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Preceded by</span></th>\n", | ||
" <td><a href=\"/wiki/Assadullah_Sarwari\" title=\"Assadullah Sarwari\">Assadullah Sarwari</a></td>\n", | ||
" </tr>, <tr>\n", | ||
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Succeeded by</span></th>\n", | ||
" <td><a href=\"/wiki/Ghulam_Faruq_Yaqubi\" title=\"Ghulam Faruq Yaqubi\">Ghulam Faruq Yaqubi</a></td>\n", | ||
" </tr>, <tr style=\"display:none\">\n", | ||
" <td colspan=\"2\"></td>\n", | ||
" </tr>, <tr>\n", | ||
" <th colspan=\"2\" style=\"text-align:center;background:lavender\">Personal details</th>\n", | ||
" </tr>, <tr>\n", | ||
" <th scope=\"row\">Born</th>\n", | ||
" <td>February 1947<br/>\n", | ||
" <a class=\"mw-redirect\" href=\"/wiki/Paktia\" title=\"Paktia\">Paktia</a>, <a href=\"/wiki/Kingdom_of_Afghanistan\" title=\"Kingdom of Afghanistan\">Afghanistan</a></td>\n", | ||
" </tr>, <tr>\n", | ||
" <th scope=\"row\">Died</th>\n", | ||
" <td><span class=\"nowrap\">28 September 1996<span style=\"display:none\">(<span class=\"dday deathdate\">1996-09-28</span>)</span> (aged 49)</span><br/>\n", | ||
" <a href=\"/wiki/Kabul\" title=\"Kabul\">Kabul</a>, <a href=\"/wiki/Islamic_Emirate_of_Afghanistan\" title=\"Islamic Emirate of Afghanistan\">Afghanistan</a></td>\n", | ||
" </tr>, <tr>\n", | ||
" <th scope=\"row\">Political party</th>\n", | ||
" <td><a href=\"/wiki/People%27s_Democratic_Party_of_Afghanistan\" title=\"People's Democratic Party of Afghanistan\">People's Democratic Party of Afghanistan</a><br/>\n", | ||
" (<a href=\"/wiki/Parcham\" title=\"Parcham\">Parcham</a>)</td>\n", | ||
" </tr>, <tr>\n", | ||
" <th scope=\"row\">Spouse(s)</th>\n", | ||
" <td>Dr. Fatana Najib</td>\n", | ||
" </tr>, <tr>\n", | ||
" <th scope=\"row\">Children</th>\n", | ||
" <td>three daughters</td>\n", | ||
" </tr>, <tr>\n", | ||
" <th scope=\"row\"><a href=\"/wiki/Alma_mater\" title=\"Alma mater\">Alma mater</a></th>\n", | ||
" <td><a href=\"/wiki/Kabul_University\" title=\"Kabul University\">Kabul University</a></td>\n", | ||
" </tr>]" | ||
] | ||
}, | ||
"execution_count": 83, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Display the current value of `lists`; the saved output is a list of <tr> rows.\n", | ||
"lists" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 61, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"None\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# NOTE(review): the saved output is None although other cells assign `lists` a\n", | ||
"# result set, and the execution counts are out of order -- this output is\n", | ||
"# probably stale; confirm with Restart Kernel -> Run All.\n", | ||
"print(lists)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.5.2" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |