From 00a46856abfd43cdb842310dc9b02bfc1ea6e6bd Mon Sep 17 00:00:00 2001 From: Diwakar Gupta <39624018+Diwakar-Gupta@users.noreply.github.com> Date: Sun, 14 Aug 2022 14:38:11 +0530 Subject: [PATCH] NLP intro --- 22-08-14-NLP/NLP_Intro_April.ipynb | 1681 ++++++++++++++++++++++++++++ 1 file changed, 1681 insertions(+) create mode 100644 22-08-14-NLP/NLP_Intro_April.ipynb diff --git a/22-08-14-NLP/NLP_Intro_April.ipynb b/22-08-14-NLP/NLP_Intro_April.ipynb new file mode 100644 index 0000000..5428303 --- /dev/null +++ b/22-08-14-NLP/NLP_Intro_April.ipynb @@ -0,0 +1,1681 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "NLP_Intro_April.ipynb", + "provenance": [], + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EtjVlIDUCt7C" + }, + "outputs": [], + "source": [ + "import nltk ## Natural Language ToolKit\n" + ] + }, + { + "cell_type": "code", + "source": [ + "nltk.download('all')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PpONcgZdD20G", + "outputId": "86abe33a-f529-4ca2-f371-fff051d57dfc" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading collection 'all'\n", + "[nltk_data] | \n", + "[nltk_data] | Downloading package abc to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/abc.zip.\n", + "[nltk_data] | Downloading package alpino to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/alpino.zip.\n", + "[nltk_data] | Downloading package averaged_perceptron_tagger to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping taggers/averaged_perceptron_tagger.zip.\n", + "[nltk_data] | Downloading package averaged_perceptron_tagger_ru to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping\n", + "[nltk_data] | taggers/averaged_perceptron_tagger_ru.zip.\n", + "[nltk_data] | Downloading package basque_grammars to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping grammars/basque_grammars.zip.\n", + "[nltk_data] | Downloading package biocreative_ppi to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/biocreative_ppi.zip.\n", + "[nltk_data] | Downloading package bllip_wsj_no_aux to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping models/bllip_wsj_no_aux.zip.\n", + "[nltk_data] | Downloading package book_grammars to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping grammars/book_grammars.zip.\n", + "[nltk_data] | Downloading package brown to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/brown.zip.\n", + "[nltk_data] | Downloading package brown_tei to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/brown_tei.zip.\n", + "[nltk_data] | Downloading package cess_cat to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/cess_cat.zip.\n", + "[nltk_data] | Downloading package cess_esp to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/cess_esp.zip.\n", + "[nltk_data] | Downloading package chat80 to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/chat80.zip.\n", + "[nltk_data] | Downloading package city_database to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/city_database.zip.\n", + "[nltk_data] | Downloading package cmudict to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/cmudict.zip.\n", + "[nltk_data] | Downloading package comparative_sentences to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/comparative_sentences.zip.\n", + "[nltk_data] | Downloading package comtrans to /root/nltk_data...\n", + "[nltk_data] | Downloading package conll2000 to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/conll2000.zip.\n", + "[nltk_data] | Downloading package conll2002 to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/conll2002.zip.\n", + "[nltk_data] | Downloading package conll2007 to /root/nltk_data...\n", + "[nltk_data] | Downloading package crubadan to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/crubadan.zip.\n", + "[nltk_data] | Downloading package dependency_treebank to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/dependency_treebank.zip.\n", + "[nltk_data] | Downloading package dolch to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/dolch.zip.\n", + "[nltk_data] | Downloading package europarl_raw to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/europarl_raw.zip.\n", + "[nltk_data] | Downloading package extended_omw to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Downloading package floresta to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/floresta.zip.\n", + "[nltk_data] | Downloading package framenet_v15 to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/framenet_v15.zip.\n", + "[nltk_data] | Downloading package framenet_v17 to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/framenet_v17.zip.\n", + "[nltk_data] | Downloading package gazetteers to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/gazetteers.zip.\n", + "[nltk_data] | Downloading package genesis to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/genesis.zip.\n", + "[nltk_data] | Downloading package gutenberg to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/gutenberg.zip.\n", + "[nltk_data] | Downloading package ieer to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/ieer.zip.\n", + "[nltk_data] | Downloading package inaugural to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/inaugural.zip.\n", + "[nltk_data] | Downloading package indian to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/indian.zip.\n", + "[nltk_data] | Downloading package jeita to /root/nltk_data...\n", + "[nltk_data] | Downloading package kimmo to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/kimmo.zip.\n", + "[nltk_data] | Downloading package knbc to /root/nltk_data...\n", + "[nltk_data] | Downloading package large_grammars to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping grammars/large_grammars.zip.\n", + "[nltk_data] | Downloading package lin_thesaurus to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/lin_thesaurus.zip.\n", + "[nltk_data] | Downloading package mac_morpho to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/mac_morpho.zip.\n", + "[nltk_data] | Downloading package machado to /root/nltk_data...\n", + "[nltk_data] | Downloading package masc_tagged to /root/nltk_data...\n", + "[nltk_data] | Downloading package maxent_ne_chunker to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping chunkers/maxent_ne_chunker.zip.\n", + "[nltk_data] | Downloading package maxent_treebank_pos_tagger to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping taggers/maxent_treebank_pos_tagger.zip.\n", + "[nltk_data] | Downloading package moses_sample to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping models/moses_sample.zip.\n", + "[nltk_data] | Downloading package movie_reviews to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/movie_reviews.zip.\n", + "[nltk_data] | Downloading package mte_teip5 to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/mte_teip5.zip.\n", + "[nltk_data] | Downloading package mwa_ppdb to /root/nltk_data...\n", + "[nltk_data] | Unzipping misc/mwa_ppdb.zip.\n", + "[nltk_data] | Downloading package names to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/names.zip.\n", + "[nltk_data] | Downloading package nombank.1.0 to /root/nltk_data...\n", + "[nltk_data] | Downloading package nonbreaking_prefixes to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/nonbreaking_prefixes.zip.\n", + "[nltk_data] | Downloading package nps_chat to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/nps_chat.zip.\n", + "[nltk_data] | Downloading package omw to /root/nltk_data...\n", + "[nltk_data] | Downloading package omw-1.4 to /root/nltk_data...\n", + "[nltk_data] | Downloading package opinion_lexicon to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/opinion_lexicon.zip.\n", + "[nltk_data] | Downloading package panlex_swadesh to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Downloading package paradigms to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/paradigms.zip.\n", + "[nltk_data] | Downloading package pe08 to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/pe08.zip.\n", + "[nltk_data] | Downloading package perluniprops to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping misc/perluniprops.zip.\n", + "[nltk_data] | Downloading package pil to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/pil.zip.\n", + "[nltk_data] | Downloading package pl196x to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/pl196x.zip.\n", + "[nltk_data] | Downloading package porter_test to /root/nltk_data...\n", + "[nltk_data] | Unzipping stemmers/porter_test.zip.\n", + "[nltk_data] | Downloading package ppattach to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/ppattach.zip.\n", + "[nltk_data] | Downloading package problem_reports to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/problem_reports.zip.\n", + "[nltk_data] | Downloading package product_reviews_1 to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/product_reviews_1.zip.\n", + "[nltk_data] | Downloading package product_reviews_2 to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/product_reviews_2.zip.\n", + "[nltk_data] | Downloading package propbank to /root/nltk_data...\n", + "[nltk_data] | Downloading package pros_cons to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/pros_cons.zip.\n", + "[nltk_data] | Downloading package ptb to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/ptb.zip.\n", + "[nltk_data] | Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] | Unzipping tokenizers/punkt.zip.\n", + "[nltk_data] | Downloading package qc to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/qc.zip.\n", + "[nltk_data] | Downloading package reuters to /root/nltk_data...\n", + "[nltk_data] | Downloading package rslp to /root/nltk_data...\n", + "[nltk_data] | Unzipping stemmers/rslp.zip.\n", + "[nltk_data] | Downloading package rte to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/rte.zip.\n", + "[nltk_data] | Downloading package sample_grammars to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping grammars/sample_grammars.zip.\n", + "[nltk_data] | Downloading package semcor to /root/nltk_data...\n", + "[nltk_data] | Downloading package senseval to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/senseval.zip.\n", + "[nltk_data] | Downloading package sentence_polarity to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/sentence_polarity.zip.\n", + "[nltk_data] | Downloading package sentiwordnet to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/sentiwordnet.zip.\n", + "[nltk_data] | Downloading package shakespeare to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/shakespeare.zip.\n", + "[nltk_data] | Downloading package sinica_treebank to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/sinica_treebank.zip.\n", + "[nltk_data] | Downloading package smultron to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/smultron.zip.\n", + "[nltk_data] | Downloading package snowball_data to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Downloading package spanish_grammars to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping grammars/spanish_grammars.zip.\n", + "[nltk_data] | Downloading package state_union to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/state_union.zip.\n", + "[nltk_data] | Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/stopwords.zip.\n", + "[nltk_data] | Downloading package subjectivity to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/subjectivity.zip.\n", + "[nltk_data] | Downloading package swadesh to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/swadesh.zip.\n", + "[nltk_data] | Downloading package switchboard to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/switchboard.zip.\n", + "[nltk_data] | Downloading package tagsets to /root/nltk_data...\n", + "[nltk_data] | Unzipping help/tagsets.zip.\n", + "[nltk_data] | Downloading package timit to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/timit.zip.\n", + "[nltk_data] | Downloading package toolbox to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/toolbox.zip.\n", + "[nltk_data] | Downloading package treebank to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/treebank.zip.\n", + "[nltk_data] | Downloading package twitter_samples to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/twitter_samples.zip.\n", + "[nltk_data] | Downloading package udhr to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/udhr.zip.\n", + "[nltk_data] | Downloading package udhr2 to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/udhr2.zip.\n", + "[nltk_data] | Downloading package unicode_samples to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/unicode_samples.zip.\n", + "[nltk_data] | Downloading package universal_tagset to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping taggers/universal_tagset.zip.\n", + "[nltk_data] | Downloading package universal_treebanks_v20 to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Downloading package vader_lexicon to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Downloading package verbnet to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/verbnet.zip.\n", + "[nltk_data] | Downloading package verbnet3 to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/verbnet3.zip.\n", + "[nltk_data] | Downloading package webtext to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/webtext.zip.\n", + "[nltk_data] | Downloading package wmt15_eval to /root/nltk_data...\n", + "[nltk_data] | Unzipping models/wmt15_eval.zip.\n", + "[nltk_data] | Downloading package word2vec_sample to\n", + "[nltk_data] | /root/nltk_data...\n", + "[nltk_data] | Unzipping models/word2vec_sample.zip.\n", + "[nltk_data] | Downloading package wordnet to /root/nltk_data...\n", + "[nltk_data] | Downloading package wordnet2021 to /root/nltk_data...\n", + "[nltk_data] | Downloading package wordnet31 to /root/nltk_data...\n", + "[nltk_data] | Downloading package wordnet_ic to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/wordnet_ic.zip.\n", + "[nltk_data] | Downloading package words to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/words.zip.\n", + "[nltk_data] | Downloading package ycoe to /root/nltk_data...\n", + "[nltk_data] | Unzipping corpora/ycoe.zip.\n", + "[nltk_data] | \n", + "[nltk_data] Done downloading collection all\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 2 + } + ] + }, + { + "cell_type": "code", + "source": [ + "paragraph = \"\"\"\n", + "Looking back on a childhood filled with events and memories, I find it rather difficult to pick one that leaves me with the fabled \"warm and fuzzy feelings.\" As the daughter of an Air Force major, I had the pleasure of traveling across America in many moving trips. I have visited the monstrous trees of the Sequoia National Forest, stood on the edge of the Grand Canyon and have jumped on the beds at Caesar's Palace in Lake Tahoe.\"\n", + "\"The day I picked my dog up from the pound was one of the happiest days of both of our lives. I had gone to the pound just a week earlier with the idea that I would just \"look\" at a puppy. Of course, you can no more just look at those squiggling little faces so filled with hope and joy than you can stop the sun from setting in the evening. I knew within minutes of walking in the door that I would get a puppy… but it wasn't until I saw him that I knew I had found my puppy.\"\n", + "\"Looking for houses was supposed to be a fun and exciting process. \"\"\" " + ], + "metadata": { + "id": "D9AVXZbOD90K" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "sentences=nltk.sent_tokenize(paragraph)" + ], + "metadata": { + "id": "02P9DKMPEgpn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "sentences" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ie-3XElKEpQg", + "outputId": "e6de2fac-35a9-46ea-dbf1-5adcb2d28515" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['\\nLooking back on a childhood filled with events and memories, I find it rather difficult to pick one that leaves me with the fabled \"warm and fuzzy feelings.\"',\n", + " 'As the daughter of an Air Force major, I had the pleasure of traveling across America in many moving trips.',\n", + " 'I have visited the monstrous trees of the Sequoia National Forest, stood on the edge of the Grand Canyon and have jumped on the beds at Caesar\\'s Palace in Lake Tahoe.\"',\n", + " '\"The day I picked my dog up from the pound was one of the happiest days of both of our lives.',\n", + " 'I had gone to the pound just a week earlier with the idea that I would just \"look\" at a puppy.',\n", + " 'Of course, you can no more just look at those squiggling little faces so filled with hope and joy than you can stop the sun from setting in the evening.',\n", + " 'I knew within minutes of walking in the door that I would get a puppy… but it wasn\\'t until I saw him that I knew I had found my puppy.\"',\n", + " '\"Looking for houses was supposed to be a fun and exciting process.']" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "source": [ + "words=nltk.word_tokenize(paragraph)" + ], + "metadata": { + "id": "KpAOneHUEsLh" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "words" + ], + "metadata": { + "id": "4T8lkbttEzis" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Stemming" + ], + "metadata": { + "id": "Lf7SZN19E28b" + } + }, + { + "cell_type": "code", + "source": [ + "sentences = nltk.sent_tokenize(paragraph)" + ], + "metadata": { + "id": "aw7iois5E0Qx" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from nltk.stem import PorterStemmer\n", + "from nltk.corpus import stopwords\n", + "stemmer=PorterStemmer()" + ], + "metadata": { + "id": "lmXTw0I0E7Ka" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "stopwords.words('english')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "U4_OpnzaFNgv", + "outputId": "3b832b64-7510-41a7-aa45-9d403bee48ba" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['i',\n", + " 'me',\n", + " 'my',\n", + " 'myself',\n", + " 'we',\n", + " 'our',\n", + " 'ours',\n", + " 'ourselves',\n", + " 'you',\n", + " \"you're\",\n", + " \"you've\",\n", + " \"you'll\",\n", + " \"you'd\",\n", + " 'your',\n", + " 'yours',\n", + " 'yourself',\n", + " 'yourselves',\n", + " 'he',\n", + " 'him',\n", + " 'his',\n", + " 'himself',\n", + " 'she',\n", + " \"she's\",\n", + " 'her',\n", + " 'hers',\n", + " 'herself',\n", + " 'it',\n", + " \"it's\",\n", + " 'its',\n", + " 'itself',\n", + " 'they',\n", + " 'them',\n", + " 'their',\n", + " 'theirs',\n", + " 'themselves',\n", + " 'what',\n", + " 'which',\n", + " 'who',\n", + " 'whom',\n", + " 'this',\n", + " 'that',\n", + " \"that'll\",\n", + " 'these',\n", + " 'those',\n", + " 'am',\n", + " 'is',\n", + " 'are',\n", + " 'was',\n", + " 'were',\n", + " 'be',\n", + " 'been',\n", + " 'being',\n", + " 'have',\n", + " 'has',\n", + " 'had',\n", + " 'having',\n", + " 'do',\n", + " 'does',\n", + " 'did',\n", + " 'doing',\n", + " 'a',\n", + " 'an',\n", + " 'the',\n", + " 'and',\n", + " 'but',\n", + " 'if',\n", + " 'or',\n", + " 'because',\n", + " 'as',\n", + " 'until',\n", + " 'while',\n", + " 'of',\n", + " 'at',\n", + " 'by',\n", + " 'for',\n", + " 'with',\n", + " 'about',\n", + " 'against',\n", + " 'between',\n", + " 'into',\n", + " 'through',\n", + " 'during',\n", + " 'before',\n", + " 'after',\n", + " 'above',\n", + " 'below',\n", + " 'to',\n", + " 'from',\n", + " 'up',\n", + " 'down',\n", + " 'in',\n", + " 'out',\n", + " 'on',\n", + " 'off',\n", + " 'over',\n", + " 'under',\n", + " 'again',\n", + " 'further',\n", + " 'then',\n", + " 'once',\n", + " 'here',\n", + " 'there',\n", + " 'when',\n", + " 'where',\n", + " 'why',\n", + " 'how',\n", + " 'all',\n", + " 'any',\n", + " 'both',\n", + " 'each',\n", + " 'few',\n", + " 'more',\n", + " 'most',\n", + " 'other',\n", + " 'some',\n", + " 'such',\n", + " 'no',\n", + " 'nor',\n", + " 'not',\n", + " 'only',\n", + " 'own',\n", + " 'same',\n", + " 'so',\n", + " 'than',\n", + " 'too',\n", + " 'very',\n", + " 's',\n", + " 't',\n", + " 'can',\n", + " 'will',\n", + " 'just',\n", + " 'don',\n", + " \"don't\",\n", + " 'should',\n", + " \"should've\",\n", + " 'now',\n", + " 'd',\n", + " 'll',\n", + " 'm',\n", + " 'o',\n", + " 're',\n", + " 've',\n", + " 'y',\n", + " 'ain',\n", + " 'aren',\n", + " \"aren't\",\n", + " 'couldn',\n", + " \"couldn't\",\n", + " 'didn',\n", + " \"didn't\",\n", + " 'doesn',\n", + " \"doesn't\",\n", + " 'hadn',\n", + " \"hadn't\",\n", + " 'hasn',\n", + " \"hasn't\",\n", + " 'haven',\n", + " \"haven't\",\n", + " 'isn',\n", + " \"isn't\",\n", + " 'ma',\n", + " 'mightn',\n", + " \"mightn't\",\n", + " 'mustn',\n", + " \"mustn't\",\n", + " 'needn',\n", + " \"needn't\",\n", + " 'shan',\n", + " \"shan't\",\n", + " 'shouldn',\n", + " \"shouldn't\",\n", + " 'wasn',\n", + " \"wasn't\",\n", + " 'weren',\n", + " \"weren't\",\n", + " 'won',\n", + " \"won't\",\n", + " 'wouldn',\n", + " \"wouldn't\"]" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ] + }, + { + "cell_type": "code", + "source": [ + "sentences" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bfInToe-FeF_", + "outputId": "9c9b13af-5a22-4214-a349-cb0ee794a850" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['\\nLooking back on a childhood filled with events and memories, I find it rather difficult to pick one that leaves me with the fabled \"warm and fuzzy feelings.\"',\n", + " 'As the daughter of an Air Force major, I had the pleasure of traveling across America in many moving trips.',\n", + " 'I have visited the monstrous trees of the Sequoia National Forest, stood on the edge of the Grand Canyon and have jumped on the beds at Caesar\\'s Palace in Lake Tahoe.\"',\n", + " '\"The day I picked my dog up from the pound was one of the happiest days of both of our lives.',\n", + " 'I had gone to the pound just a week earlier with the idea that I would just \"look\" at a puppy.',\n", + " 'Of course, you can no more just look at those squiggling little faces so filled with hope and joy than you can stop the sun from setting in the evening.',\n", + " 'I knew within minutes of walking in the door that I would get a puppy… but it wasn\\'t until I saw him that I knew I had found my puppy.\"',\n", + " '\"Looking for houses was supposed to be a fun and exciting process.']" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Stemming\n", + "for i in range(len(sentences)):\n", + " words = nltk.word_tokenize(sentences[i])\n", + " words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]\n", + " sentences[i] = ' '.join(words)" + ], + "metadata": { + "id": "2pqIQUl9FHPC" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "sentences" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mmykru4NFZcW", + "outputId": "886dda59-1892-4c99-cf4c-cb4f61a7946f" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[\"look back childhood fill event memori , i find rather difficult pick one leav fabl `` warm fuzzi feel . ''\",\n", + " 'as daughter air forc major , i pleasur travel across america mani move trip .',\n", + " \"i visit monstrou tree sequoia nation forest , stood edg grand canyon jump bed caesar 's palac lake taho . ''\",\n", + " '`` the day i pick dog pound one happiest day live .',\n", + " \"i gone pound week earlier idea i would `` look '' puppi .\",\n", + " 'of cours , look squiggl littl face fill hope joy stop sun set even .',\n", + " \"i knew within minut walk door i would get puppy… n't i saw i knew i found puppi . ''\",\n", + " '`` look hous suppos fun excit process .']" + ] + }, + "metadata": {}, + "execution_count": 18 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Lemmetization" + ], + "metadata": { + "id": "R1AAV3d0GHSp" + } + }, + { + "cell_type": "code", + "source": [ + "from nltk.stem import WordNetLemmatizer\n", + "from nltk.corpus import stopwords\n", + "\n", + "sentences = nltk.sent_tokenize(paragraph)\n", + "lemmatizer = WordNetLemmatizer()\n", + "\n", + "for i in range(len(sentences)):\n", + " words = nltk.word_tokenize(sentences[i])\n", + " words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]\n", + " sentences[i] = ' '.join(words)" + ], + "metadata": { + "id": "F--Uhj7tFxiZ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "sentences" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "G_1QMDdeGSOb", + "outputId": "cf478d5a-a28c-4e86-add4-f4c39814c843" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[\"Looking back childhood filled event memory , I find rather difficult pick one leaf fabled `` warm fuzzy feeling . ''\",\n", + " 'As daughter Air Force major , I pleasure traveling across America many moving trip .',\n", + " \"I visited monstrous tree Sequoia National Forest , stood edge Grand Canyon jumped bed Caesar 's Palace Lake Tahoe . ''\",\n", + " '`` The day I picked dog pound one happiest day life .',\n", + " \"I gone pound week earlier idea I would `` look '' puppy .\",\n", + " 'Of course , look squiggling little face filled hope joy stop sun setting evening .',\n", + " \"I knew within minute walking door I would get puppy… n't I saw I knew I found puppy . ''\",\n", + " '`` Looking house supposed fun exciting process .']" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "POS Tagging" + ], + "metadata": { + "id": "3lkNI8b7Gjqu" + } + }, + { + "cell_type": "code", + "source": [ + "sentence = \"A quick brown fox runs over a greedy dog\"\n", + "token = nltk.word_tokenize(sentence)\n", + "token" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hKprfi1MGXgG", + "outputId": "14cad267-df4d-4600-da46-eb891a92a42e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['A', 'quick', 'brown', 'fox', 'runs', 'over', 'a', 'greedy', 'dog']" + ] + }, + "metadata": {}, + "execution_count": 21 + } + ] + }, + { + "cell_type": "code", + "source": [ + "nltk.pos_tag(token)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-O5yFTdRG6Vz", + "outputId": "7d79f04e-83e7-4957-c554-414b57486d25" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[('A', 'DT'),\n", + " ('quick', 'JJ'),\n", + " ('brown', 'NN'),\n", + " ('fox', 'NN'),\n", + " ('runs', 'VBZ'),\n", + " ('over', 'IN'),\n", + " ('a', 'DT'),\n", + " ('greedy', 'NN'),\n", + " ('dog', 'NN')]" + ] + }, + "metadata": {}, + "execution_count": 23 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# We can get more details about any POS tag using help funciton of NLTK as follows.\n", + "nltk.help.upenn_tagset(\"JJ\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1twd5yS-G9TQ", + "outputId": "531241a4-8283-4d73-a34f-d9092abeacf2" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "JJ: adjective or numeral, ordinal\n", + " third ill-mannered pre-war regrettable oiled calamitous first separable\n", + " ectoplasmic battery-powered participatory fourth still-to-be-named\n", + " multilingual multi-disciplinary ...\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "nltk.help.upenn_tagset(\"VBZ\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lbVf80v1HF-5", + "outputId": "19dc06c9-a9ad-4f73-ef18-611bf1e7460a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "VBZ: verb, present tense, 3rd person singular\n", + " bases reconstructs marks mixes displeases seals carps weaves snatches\n", + " slumps stretches authorizes smolders pictures emerges stockpiles\n", + " seduces fizzes uses bolsters slaps speaks pleads ...\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "BAG OF WORDS" + ], + "metadata": { + "id": "x6wAhMyrKtIo" + } + }, + { + "cell_type": "code", + "source": [ + "import re #Regular Expression\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem.porter import PorterStemmer\n", + "from nltk.stem import WordNetLemmatizer" + ], + "metadata": { + "id": "_PrvOiaJHNvA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "ps = PorterStemmer()\n", + "wordnet=WordNetLemmatizer()\n", + "sentences = nltk.sent_tokenize(paragraph)\n", + "corpus = []" + ], + "metadata": { + "id": "_Ez-DYGUPDZ7" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "sentences[0]+'11212'" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 53 + }, + "id": "h6G9qe1tPXlN", + "outputId": "c0f3b138-0caf-473b-e1dd-cdf3b3d15a10" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'\\nLooking back on a childhood filled with events and memories, I find it rather difficult to pick one that leaves me with the fabled \"warm and fuzzy feelings.\"11212'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 31 + } + ] + }, + { + "cell_type": "code", + "source": [ + "re.sub('[^a-zA-Z]', ' ', sentences[0])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 53 + }, + "id": "VaStGzq4PZpm", + "outputId": "320c18f3-7aef-44a9-fded-0797ec6a1e12" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "' Looking back on a childhood filled with events and memories I find it rather difficult to pick one that leaves me with the fabled warm and fuzzy feelings '" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 32 + } + ] + }, + { + "cell_type": "code", + "source": [ + "for i in range(len(sentences)):\n", + " review = re.sub('[^a-zA-Z]', ' ', sentences[i])\n", + " review = review.lower()\n", + " review = review.split()\n", + " review = [wordnet.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]\n", + " review = ' '.join(review)\n", + " corpus.append(review)" + ], + "metadata": { + "id": "XsuGJtw7PGGq" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "corpus" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HmdObQvfPyUU", + "outputId": "3d03fb64-fba5-46e5-9c09-88099479d484" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['looking back childhood filled event memory find rather difficult pick one leaf fabled warm fuzzy feeling',\n", + " 'daughter air force major pleasure traveling across america many moving trip',\n", + " 'visited monstrous tree sequoia national forest stood edge grand canyon jumped bed caesar palace lake tahoe',\n", + " 'day picked dog pound one happiest day life',\n", + " 'gone pound week earlier idea would look puppy',\n", + " 'course look squiggling little face filled hope joy stop sun setting evening',\n", + " 'knew within minute walking door would get puppy saw knew found puppy',\n", + " 'looking house supposed fun exciting process']" + ] + }, + "metadata": {}, + "execution_count": 34 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Creating the Bag of Words model\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "cv = CountVectorizer(max_features = 1500)\n", + "X = cv.fit_transform(corpus).toarray()" + ], + "metadata": { + "id": "ectRiysqPzMQ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "X" + ], + "metadata": { + "id": "MrwQOSesP8eo" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "X.shape" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "E0v7a9BeP85U", + "outputId": "128d5968-342f-423c-d8f7-6224ecb2242f" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(8, 79)" + ] + }, + "metadata": {}, + "execution_count": 37 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "TF-IDF" + ], + "metadata": { + "id": "UTQhnfJRQImf" + } + }, + { + "cell_type": "code", + "source": [ + "import re\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem.porter import PorterStemmer\n", + "from nltk.stem import WordNetLemmatizer" + ], + "metadata": { + "id": "GqcEOcjaQEvE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#ps = PorterStemmer()\n", + "wordnet=WordNetLemmatizer()\n", + "sentences = nltk.sent_tokenize(paragraph)\n", + "corpus = []\n", + "for i in range(len(sentences)):\n", + " review = re.sub('[^a-zA-Z]', ' ', sentences[i])\n", + " review = review.lower()\n", + " review = review.split()\n", + " review = [wordnet.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]\n", + " review = ' '.join(review)\n", + " corpus.append(review)" + ], + "metadata": { + "id": "97CExC3IQLZA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Creating TF-IDF Model\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "tfidf=TfidfVectorizer()" + ], + "metadata": { + "id": "fZa067jbQNZs" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "X_tfidf=tfidf.fit_transform(corpus).toarray()" + ], + "metadata": { + "id": "7p3a-r96QQYB" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "X_tfidf" + ], + "metadata": { + "id": "lACUy-n7QRzC" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "X_tfidf.shape" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PA1vkbzKQS-E", + "outputId": "6dda2b2b-6cf5-473c-b164-718881abec6c" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(8, 79)" + ] + }, + "metadata": {}, + "execution_count": 43 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Word2Vec" + ], + "metadata": { + "id": "yYrQ9yQQQuCn" + } + }, + { + "cell_type": "code", + "source": [ + "### A king is similar to queen\n", + "### A man is similar to woman\n", + "text = re.sub(r'\\[[0-9]*\\]',' ',paragraph)\n", + "text = re.sub(r'\\s+',' ',text)\n", + "text = text.lower()\n", + "text = re.sub(r'\\d',' ',text)\n", + "text = re.sub(r'\\s+',' ',text)" + ], + "metadata": { + "id": "m5AucxtkQabA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Preparing the dataset\n", + "sentences = nltk.sent_tokenize(text)" + ], + "metadata": { + "id": "SuWK_DUGS-oP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "sentences" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kSLI0sVbTCuW", + "outputId": "f3c818a4-a60d-4a7a-b42a-752406f5501e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[' looking back on a childhood filled with events and memories, i find it rather difficult to pick one that leaves me with the fabled \"warm and fuzzy feelings.\"',\n", + " 'as the daughter of an air force major, i had the pleasure of traveling across america in many moving trips.',\n", + " 'i have visited the monstrous trees of the sequoia national forest, stood on the edge of the grand canyon and have jumped on the beds at caesar\\'s palace in lake tahoe.\"',\n", + " '\"the day i picked my dog up from the pound was one of the happiest days of both of our lives.',\n", + " 'i had gone to the pound just a week earlier with the idea that i would just \"look\" at a puppy.',\n", + " 'of course, you can no more just look at those squiggling little faces so filled with hope and joy than you can stop the sun from setting in the evening.',\n", + " 'i knew within minutes of walking in the door that i would get a puppy… but it wasn\\'t until i saw him that i knew i had found my puppy.\"',\n", + " '\"looking for houses was supposed to be a fun and exciting process.']" + ] + }, + "metadata": {}, + "execution_count": 46 + } + ] + }, + { + "cell_type": "code", + "source": [ + "sentences = [nltk.word_tokenize(sentence) for sentence in sentences]" + ], + "metadata": { + "id": "yVOYBPlCTEz-" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#StopWord Removal\n", + "for i in range(len(sentences)):\n", + " sentences[i] = [word for word in sentences[i] if word not in stopwords.words('english')]" + ], + "metadata": { + "id": "D4poc4B4THho" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "sentences" + ], + "metadata": { + "id": "V8QdexkOTIy5" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from gensim.models import Word2Vec" + ], + "metadata": { + "id": "BwMAygitTKnk" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "model=Word2Vec(sentences,min_count=1)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mBxyqLEZTUpH", + "outputId": "3d213927-9178-4a51-f91a-81b2636614cd" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "WARNING:gensim.models.base_any2vec:under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "model" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DPiFuI5-Te9K", + "outputId": "28eec034-8600-4a0e-9c45-6ba96512959a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 52 + } + ] + }, + { + "cell_type": "code", + "source": [ + "model.wv.vocab" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uQl63dqLTgCN", + "outputId": "457ccdcb-b4ba-4083-efe3-5bc2cbef9f37" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{\"''\": ,\n", + " \"'s\": ,\n", + " ',': ,\n", + " '.': ,\n", + " '``': ,\n", + " 'across': ,\n", + " 'air': ,\n", + " 'america': ,\n", + " 'back': ,\n", + " 'beds': ,\n", + " 'caesar': ,\n", + " 'canyon': ,\n", + " 'childhood': ,\n", + " 'course': ,\n", + " 'daughter': ,\n", + " 'day': ,\n", + " 'days': ,\n", + " 'difficult': ,\n", + " 'dog': ,\n", + " 'door': ,\n", + " 'earlier': ,\n", + " 'edge': ,\n", + " 'evening': ,\n", + " 'events': ,\n", + " 'exciting': ,\n", + " 'fabled': ,\n", + " 'faces': ,\n", + " 'feelings': ,\n", + " 'filled': ,\n", + " 'find': ,\n", + " 'force': ,\n", + " 'forest': ,\n", + " 'found': ,\n", + " 'fun': ,\n", + " 'fuzzy': ,\n", + " 'get': ,\n", + " 'gone': ,\n", + " 'grand': ,\n", + " 'happiest': ,\n", + " 'hope': ,\n", + " 'houses': ,\n", + " 'idea': ,\n", + " 'joy': ,\n", + " 'jumped': ,\n", + " 'knew': ,\n", + " 'lake': ,\n", + " 'leaves': ,\n", + " 'little': ,\n", + " 'lives': ,\n", + " 'look': ,\n", + " 'looking': ,\n", + " 'major': ,\n", + " 'many': ,\n", + " 'memories': ,\n", + " 'minutes': ,\n", + " 'monstrous': ,\n", + " 'moving': ,\n", + " \"n't\": ,\n", + " 'national': ,\n", + " 'one': ,\n", + " 'palace': ,\n", + " 'pick': ,\n", + " 'picked': ,\n", + " 'pleasure': ,\n", + " 'pound': ,\n", + " 'process': ,\n", + " 'puppy': ,\n", + " 'puppy…': ,\n", + " 'rather': ,\n", + " 'saw': ,\n", + " 'sequoia': ,\n", + " 'setting': ,\n", + " 'squiggling': ,\n", + " 'stood': ,\n", + " 'stop': ,\n", + " 'sun': ,\n", + " 'supposed': ,\n", + " 'tahoe': ,\n", + " 'traveling': ,\n", + " 'trees': ,\n", + " 'trips': ,\n", + " 'visited': ,\n", + " 'walking': ,\n", + " 'warm': ,\n", + " 'week': ,\n", + " 'within': ,\n", + " 'would': }" + ] + }, + "metadata": {}, + "execution_count": 53 + } + ] + }, + { + "cell_type": "code", + "source": [ + "model.wv['within']" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "K_3q5caTTkPt", + "outputId": "482e91f2-7e1a-4e1d-dec1-4c7c335b1b32" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([ 9.3364244e-05, -2.0226017e-03, 3.0744560e-03, 2.4722507e-03,\n", + " -1.7916801e-03, 1.1004605e-03, 1.5778342e-03, -3.4744558e-03,\n", + " 2.3985626e-03, 4.4293050e-03, -3.2416270e-03, 9.6469990e-04,\n", + " -1.8468881e-03, -4.5837839e-03, -2.7734184e-04, -4.2819157e-03,\n", + " -3.4303457e-04, -2.3855946e-03, 8.0992520e-04, -1.1062848e-03,\n", + " -3.0107235e-03, -3.3425987e-03, 2.1235049e-03, 2.2391626e-03,\n", + " 3.5790335e-03, -5.0837500e-04, -2.4947856e-04, -3.1816968e-04,\n", + " 2.5805044e-03, -3.9695371e-03, -3.2627376e-04, 3.3404287e-03,\n", + " 3.3210497e-03, -3.7256633e-03, 2.4546732e-03, -3.5926504e-03,\n", + " -3.1259684e-03, 4.1785319e-03, -1.8811250e-03, -3.2083079e-04,\n", + " 1.0983367e-03, 3.0588740e-03, -3.8055759e-03, 1.8654363e-03,\n", + " -2.9959625e-03, 1.9540614e-03, -3.4162696e-03, -2.8583435e-03,\n", + " -4.2043673e-03, 4.3449313e-03, 4.6059112e-03, 3.2427472e-03,\n", + " -2.5208378e-03, -1.8257565e-03, 6.5149547e-04, 4.7284369e-03,\n", + " 4.6374514e-03, -6.3585694e-04, -3.1542520e-03, 3.3707032e-03,\n", + " -1.2445718e-03, -3.5111818e-03, 6.5203488e-04, 1.2171916e-03,\n", + " -2.3727534e-04, -3.1939638e-04, 9.9689921e-04, 2.6938734e-03,\n", + " 4.8971297e-03, 3.5206450e-03, -4.8659677e-03, -1.8277732e-03,\n", + " 2.6473652e-03, 2.9146350e-03, -4.9722218e-03, 2.6932417e-03,\n", + " 2.5721423e-03, 4.2625722e-03, -7.3851732e-04, -3.2395408e-03,\n", + " 1.5004680e-03, -1.8992539e-03, 4.8010377e-03, 4.0566269e-03,\n", + " -1.9251317e-03, -2.0484554e-03, -1.7119809e-03, -4.4474346e-03,\n", + " -2.1356612e-03, -4.4765472e-03, 4.5961127e-04, -2.1204483e-03,\n", + " 2.8737509e-04, -2.6111265e-03, -4.5112278e-03, -1.3529632e-03,\n", + " 2.2771490e-04, -4.9307575e-03, -4.3379571e-03, 1.6518446e-03],\n", + " dtype=float32)" + ] + }, + "metadata": {}, + "execution_count": 57 + } + ] + }, + { + "cell_type": "code", + "source": [ + "model.wv.most_similar('national')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PCP3GC49Ttlw", + "outputId": "1299a52e-d719-40f9-9e25-ab5a5de1d60c" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[('air', 0.2506329417228699),\n", + " ('monstrous', 0.20575261116027832),\n", + " (\"n't\", 0.19816187024116516),\n", + " ('lake', 0.17937466502189636),\n", + " ('pound', 0.17282086610794067),\n", + " ('fun', 0.13929110765457153),\n", + " ('leaves', 0.13837510347366333),\n", + " ('happiest', 0.12096145749092102),\n", + " ('feelings', 0.11812127381563187),\n", + " ('found', 0.10904533416032791)]" + ] + }, + "metadata": {}, + "execution_count": 58 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import gensim.downloader" + ], + "metadata": { + "id": "bdRhM-zpT_JZ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(list(gensim.downloader.info()['models'].keys()))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yT2Mq4AVUIRk", + "outputId": "7e829b86-82ee-4217-d8ad-e70f0dec54f2" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "google_vectors=gensim.downloader.load('glove-twitter-25')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cKkqSBIyUYz_", + "outputId": "6bbfdf8e-7df4-4dfd-a55f-e5c99beb0e66" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[==================================================] 100.0% 104.8/104.8MB downloaded\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "google_vectors.wv.most_similar('twitter')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7NjGT-RVUc2j", + "outputId": "a7fdc59a-4f83-491c-d0e3-ce863b4ef02c" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[('facebook', 0.9480051398277283),\n", + " ('tweet', 0.9403422474861145),\n", + " ('fb', 0.9342358708381653),\n", + " ('instagram', 0.9104823470115662),\n", + " ('chat', 0.8964964747428894),\n", + " ('hashtag', 0.8885936141014099),\n", + " ('tweets', 0.8878157734870911),\n", + " ('tl', 0.8778461813926697),\n", + " ('link', 0.877821147441864),\n", + " ('internet', 0.8753897547721863)]" + ] + }, + "metadata": {}, + "execution_count": 62 + } + ] + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "_sQy5bfdUr0e" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file