# https://en.wikipedia.org/wiki
import datetime
import json
import pickle
import re
import traceback

import requests
from bs4 import BeautifulSoup

# Pattern used to pick the Arabic entries: an opening parenthesis followed
# (lazily) by "ar".  Presumably the entity page tags each spelling variant
# like "name (ar)" -- TODO confirm against a live page.
_ARABIC_TAG = re.compile(r"\(.*?ar")
# Whitespace plus a parenthesised marker; removed to leave only the name.
_LANGUAGE_TAG = re.compile(r"\s+?\(.+?\)")


def name_formatter(name):
    """Turn an underscore-separated name into a title-cased display name.

    Underscores become spaces, surrounding whitespace is stripped and the
    result is passed through ``str.title``.

    >>> name_formatter("Mohammad_Najibullah")
    'Mohammad Najibullah'
    """
    return name.replace("_", " ").strip().title()


def ar_lookup(name, timeout=10):
    """Look up Arabic spellings of an English name on EMM NewsExplorer.

    Convert names from English to Arabic using European Media Monitors and
    code modified from Phil Schrodt.

    Args:
        name: English name; it is normalised with ``name_formatter`` first,
            so underscores are accepted.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of the entries whose marker matched the Arabic pattern, with
        the parenthesised marker removed.  An empty list is returned when
        the search yields no entity link or when anything fails on the way.
    """
    name = name_formatter(name)
    search_url = (
        "http://emm.newsexplorer.eu/NewsExplorer/search/en/entities?query="
        + name
    )

    try:
        page = requests.get(search_url, timeout=timeout)
        soup = BeautifulSoup(page.content, "lxml")
        name_url = soup.find("p", {"class": "center_headline"}).find("a")["href"]
    except Exception:
        # Deliberate best effort: a failed request or a results page without
        # the expected headline link both mean "no match".
        return []

    try:
        entity_url = (
            "http://emm.newsexplorer.eu/NewsExplorer/search/en/" + name_url
        )
        name_page = requests.get(entity_url, timeout=timeout)
        soup = BeautifulSoup(name_page.content, "lxml")
        # TODO (from the original author): check to make sure in list?
        # Take in alt names? /petrarch2/petrarch2/data/dictionaries
        paragraphs = soup.find("td", {"colspan": "1"}).find_all("p")
        # The first paragraph is skipped, as in the original code.
        names = [paragraph.text for paragraph in paragraphs][1:]
        return [
            _LANGUAGE_TAG.sub("", entry)
            for entry in names
            if _ARABIC_TAG.search(entry)
        ]
    except Exception:
        # `traceback` is now imported above; previously this line raised
        # NameError and masked the real error.
        traceback.print_exc()
        return []


# this will need the user to pass in an english name and come back with an
# arabic name
def hack_wiki(eng_name, timeout=10):
    """Fetch the Arabic Wikipedia title for an English Wikipedia page name.

    Loads ``https://en.wikipedia.org/wiki/<eng_name>``, follows the link in
    the ``interwiki-ar`` list item and reads the ``firstHeading`` element of
    the page it points to.

    Args:
        eng_name: Page name as it appears in the English Wikipedia URL,
            e.g. ``"Mohammad_Najibullah"``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The ``contents`` list of the Arabic page heading (it is also printed,
        as before).  On any failure the exception is printed and an empty
        list is returned.
    """
    base_url = "https://en.wikipedia.org/wiki/" + eng_name
    try:
        page = requests.get(base_url, timeout=timeout)
        soup = BeautifulSoup(page.content, "lxml")
        ar_url = soup.find("li", {"class": "interwiki-ar"}).find("a")["href"]
        ar_page = requests.get(ar_url, timeout=timeout)
        ar_soup = BeautifulSoup(ar_page.content, "lxml")
        ar_name = ar_soup.find(id="firstHeading").contents
        print(ar_name)
        return ar_name
    except Exception as e:
        print(e)
        return []
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['محمد نجيب الله']\n" + ] + } + ], + "source": [ + "hack_wiki(\"Mohammad_Najibullah\")" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [], + "source": [ + "base_url=\"https://en.wikipedia.org/wiki/\"+\"Mohammad_Najibullah\"\n", + "page=requests.get(base_url)\n", + "soup=BeautifulSoup(page.content,\"lxml\")\n", + "lists=soup.select(\".infobox tr th a\")" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['President of Afghanistan']\n", + "[\"General Secretary of the Central Committee of the People's Democratic Party of Afghanistan\"]\n", + "['Director of the State Intelligence Agency']\n", + "['Alma mater']\n" + ] + } + ], + "source": [ + "for item in lists:\n", + " print(item.contents)" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "30 September 1987– 16 April 1992\n", + "4 May 1986– 16 April 1992\n", + "11 January 1980– 21 November 1985\n" + ] + } + ], + "source": [ + "base_url=\"https://en.wikipedia.org/wiki/\"+\"Mohammad_Najibullah\"\n", + "page=requests.get(base_url)\n", + "soup=BeautifulSoup(page.content,\"lxml\")\n", + "#lists=soup.select(\".infobox tr th a\")\n", + "lists=soup.find(\"table\",{\"class\":\"infobox\"}).find_all(\"tr\")\n", + "for item in lists:\n", + " if(item.find(\"td\") is not None):\n", + " if(item.find(\"td\").find(\"span\",{\"class\":\"nowrap\"}) is not None):\n", + " if(item.find(\"td\").find(\"span\",{\"class\":\"nowrap\"}).parent is not None):\n", + " #if(item.find(\"td\").find(\"span\",{\"class\":\"nowrap\"}).find(\"td\") is not None):\n", + " tempstr=item.find(\"td\").find(\"span\",{\"class\":\"nowrap\"}).parent.contents\n", + " strstr=str(tempstr)\n", + " if(\"display:none\" not in 
strstr):\n", + " print(strstr.split('\\\\n')[1][:-2].replace(\"\\\\xa0\",''))\n", + " \n", + "#$('.infobox tr th a').closest(\"tr\").next(\"tr\").find(\".nowrap\").closest(\"td\").each(function(){console.log($(this).text())})" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[