"""Look up Arabic renderings of English names.

Code cells of the ``hack_wiki`` notebook, laid out as a module.  Two sources
are queried: the EMM NewsExplorer entity search (``ar_lookup``) and the Arabic
interlanguage link of an English Wikipedia article (``hack_wiki``).
"""
# https://en.wikipedia.org/wiki
import datetime
import json
import pickle
import re
import traceback

import requests
from bs4 import BeautifulSoup

EMM_SEARCH_BASE = "http://emm.newsexplorer.eu/NewsExplorer/search/en/"
WIKI_BASE = "https://en.wikipedia.org/wiki/"
# Seconds. requests applies no timeout unless one is passed explicitly, so a
# stalled server would otherwise block the caller forever.
REQUEST_TIMEOUT = 10

# Raw strings keep the regex backslashes out of Python's own escape handling.
# Matches an opening parenthesis followed (lazily) by "ar" -- presumably the
# language tag printed after each name variant; confirm against a live page.
_ARABIC_TAG = re.compile(r"\(.*?ar")
# Matches whitespace plus a parenthesised group, which is removed from a variant.
_PAREN_SUFFIX = re.compile(r"\s+?\(.+?\)")


def name_formatter(name):
    """Return *name* with underscores as spaces, stripped and title-cased.

    >>> name_formatter("Mohammad_Najibullah")
    'Mohammad Najibullah'
    """
    return name.replace("_", " ").strip().title()


def _fetch_soup(url):
    """Download *url* and parse the response body with the ``lxml`` parser."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.content, "lxml")


def ar_lookup(name):
    """Convert a name from English to Arabic using European Media Monitors.

    Based on code modified from Phil Schrodt.

    Args:
        name: English name; it is passed through ``name_formatter`` first.

    Returns:
        A list of the name variants that match the Arabic tag pattern, with
        their parenthesised suffix removed.  An empty list is returned when
        the search page yields no entity link or the entity page cannot be
        fetched or parsed.
    """
    name = name_formatter(name)
    search_url = EMM_SEARCH_BASE + "entities?query=" + name

    try:
        soup = _fetch_soup(search_url)
        entity_href = soup.find("p", {"class": "center_headline"}).find("a")["href"]
    except Exception:
        # Best effort: no usable result page is reported as "no match".
        return []

    try:
        soup = _fetch_soup(EMM_SEARCH_BASE + entity_href)
        # TODO: check that the match is in the known list / take in alternative
        # names (petrarch2/petrarch2/data/dictionaries).
        paragraphs = soup.find("td", {"colspan": "1"}).find_all("p")
        # The first <p> is skipped -- presumably a heading; confirm.
        variants = [p.text for p in paragraphs][1:]
        return [
            _PAREN_SUFFIX.sub("", variant)
            for variant in variants
            if _ARABIC_TAG.search(variant)
        ]
    except Exception:
        traceback.print_exc()
        return []


def hack_wiki(eng_name):
    """Print and return the Arabic Wikipedia heading for an English article.

    Args:
        eng_name: Article name as used in the English Wikipedia URL, e.g.
            ``"Mohammad_Najibullah"``.

    Returns:
        The contents of the ``firstHeading`` element of the page linked from
        the ``interwiki-ar`` list item (a list; the notebook run for the
        example above printed ``['محمد نجيب الله']``), or ``None`` when any
        step fails.  The value, or the exception on failure, is also printed.
    """
    try:
        soup = _fetch_soup(WIKI_BASE + eng_name)
        ar_url = soup.find("li", {"class": "interwiki-ar"}).find("a")["href"]
        ar_soup = _fetch_soup(ar_url)
        ar_name = ar_soup.find(id="firstHeading").contents
        print(ar_name)
        return ar_name
    except Exception as e:
        print(e)
        return None


def _print_infobox_headings(soup):
    """Print the contents of every link found under ``.infobox tr th a``.

    For the Mohammad_Najibullah article the notebook run printed, among
    others, ``['President of Afghanistan']`` and ``['Alma mater']``.
    """
    for link in soup.select(".infobox tr th a"):
        print(link.contents)


def _print_infobox_terms(soup):
    """Print the ``nowrap`` spans' surrounding text per infobox row; return the rows.

    For the Mohammad_Najibullah article the notebook run printed lines such as
    ``30 September 1987– 16 April 1992``.
    """
    rows = soup.find("table", {"class": "infobox"}).find_all("tr")
    for row in rows:
        cell = row.find("td")
        if cell is None:
            continue
        span = cell.find("span", {"class": "nowrap"})
        if span is None or span.parent is None:
            continue
        rendered = str(span.parent.contents)
        if "display:none" in rendered:
            continue
        # str() of the contents list shows newlines and non-breaking spaces as
        # the literal escapes \n and \xa0, hence the backslash-escaped
        # separators.  The wanted text is the segment after the first literal
        # \n, minus its last two characters.
        print(rendered.split("\\n")[1][:-2].replace("\\xa0", ""))
    return rows
    # Browser-console (jQuery) probe kept from the notebook:
    # $('.infobox tr th a').closest("tr").next("tr").find(".nowrap").closest("td").each(function(){console.log($(this).text())})


def main():
    """Replay the notebook's exploratory cells for one sample article."""
    sample = "Mohammad_Najibullah"
    hack_wiki(sample)
    # One download serves both infobox probes.
    soup = _fetch_soup(WIKI_BASE + sample)
    _print_infobox_headings(soup)
    rows = _print_infobox_terms(soup)
    print(rows)


if __name__ == "__main__":
    main()