From 00a46856abfd43cdb842310dc9b02bfc1ea6e6bd Mon Sep 17 00:00:00 2001
From: Diwakar Gupta <39624018+Diwakar-Gupta@users.noreply.github.com>
Date: Sun, 14 Aug 2022 14:38:11 +0530
Subject: [PATCH] NLP intro
---
22-08-14-NLP/NLP_Intro_April.ipynb | 1681 ++++++++++++++++++++++++++++
1 file changed, 1681 insertions(+)
create mode 100644 22-08-14-NLP/NLP_Intro_April.ipynb
diff --git a/22-08-14-NLP/NLP_Intro_April.ipynb b/22-08-14-NLP/NLP_Intro_April.ipynb
new file mode 100644
index 0000000..5428303
--- /dev/null
+++ b/22-08-14-NLP/NLP_Intro_April.ipynb
@@ -0,0 +1,1681 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "NLP_Intro_April.ipynb",
+ "provenance": [],
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "EtjVlIDUCt7C"
+ },
+ "outputs": [],
+ "source": [
+ "import nltk ## Natural Language ToolKit\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "nltk.download('all')"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "PpONcgZdD20G",
+ "outputId": "86abe33a-f529-4ca2-f371-fff051d57dfc"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "[nltk_data] Downloading collection 'all'\n",
+ "[nltk_data] | \n",
+ "[nltk_data] | Downloading package abc to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/abc.zip.\n",
+ "[nltk_data] | Downloading package alpino to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/alpino.zip.\n",
+ "[nltk_data] | Downloading package averaged_perceptron_tagger to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping taggers/averaged_perceptron_tagger.zip.\n",
+ "[nltk_data] | Downloading package averaged_perceptron_tagger_ru to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping\n",
+ "[nltk_data] | taggers/averaged_perceptron_tagger_ru.zip.\n",
+ "[nltk_data] | Downloading package basque_grammars to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping grammars/basque_grammars.zip.\n",
+ "[nltk_data] | Downloading package biocreative_ppi to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/biocreative_ppi.zip.\n",
+ "[nltk_data] | Downloading package bllip_wsj_no_aux to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping models/bllip_wsj_no_aux.zip.\n",
+ "[nltk_data] | Downloading package book_grammars to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping grammars/book_grammars.zip.\n",
+ "[nltk_data] | Downloading package brown to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/brown.zip.\n",
+ "[nltk_data] | Downloading package brown_tei to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/brown_tei.zip.\n",
+ "[nltk_data] | Downloading package cess_cat to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/cess_cat.zip.\n",
+ "[nltk_data] | Downloading package cess_esp to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/cess_esp.zip.\n",
+ "[nltk_data] | Downloading package chat80 to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/chat80.zip.\n",
+ "[nltk_data] | Downloading package city_database to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/city_database.zip.\n",
+ "[nltk_data] | Downloading package cmudict to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/cmudict.zip.\n",
+ "[nltk_data] | Downloading package comparative_sentences to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/comparative_sentences.zip.\n",
+ "[nltk_data] | Downloading package comtrans to /root/nltk_data...\n",
+ "[nltk_data] | Downloading package conll2000 to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/conll2000.zip.\n",
+ "[nltk_data] | Downloading package conll2002 to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/conll2002.zip.\n",
+ "[nltk_data] | Downloading package conll2007 to /root/nltk_data...\n",
+ "[nltk_data] | Downloading package crubadan to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/crubadan.zip.\n",
+ "[nltk_data] | Downloading package dependency_treebank to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/dependency_treebank.zip.\n",
+ "[nltk_data] | Downloading package dolch to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/dolch.zip.\n",
+ "[nltk_data] | Downloading package europarl_raw to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/europarl_raw.zip.\n",
+ "[nltk_data] | Downloading package extended_omw to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Downloading package floresta to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/floresta.zip.\n",
+ "[nltk_data] | Downloading package framenet_v15 to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/framenet_v15.zip.\n",
+ "[nltk_data] | Downloading package framenet_v17 to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/framenet_v17.zip.\n",
+ "[nltk_data] | Downloading package gazetteers to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/gazetteers.zip.\n",
+ "[nltk_data] | Downloading package genesis to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/genesis.zip.\n",
+ "[nltk_data] | Downloading package gutenberg to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/gutenberg.zip.\n",
+ "[nltk_data] | Downloading package ieer to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/ieer.zip.\n",
+ "[nltk_data] | Downloading package inaugural to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/inaugural.zip.\n",
+ "[nltk_data] | Downloading package indian to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/indian.zip.\n",
+ "[nltk_data] | Downloading package jeita to /root/nltk_data...\n",
+ "[nltk_data] | Downloading package kimmo to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/kimmo.zip.\n",
+ "[nltk_data] | Downloading package knbc to /root/nltk_data...\n",
+ "[nltk_data] | Downloading package large_grammars to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping grammars/large_grammars.zip.\n",
+ "[nltk_data] | Downloading package lin_thesaurus to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/lin_thesaurus.zip.\n",
+ "[nltk_data] | Downloading package mac_morpho to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/mac_morpho.zip.\n",
+ "[nltk_data] | Downloading package machado to /root/nltk_data...\n",
+ "[nltk_data] | Downloading package masc_tagged to /root/nltk_data...\n",
+ "[nltk_data] | Downloading package maxent_ne_chunker to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping chunkers/maxent_ne_chunker.zip.\n",
+ "[nltk_data] | Downloading package maxent_treebank_pos_tagger to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping taggers/maxent_treebank_pos_tagger.zip.\n",
+ "[nltk_data] | Downloading package moses_sample to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping models/moses_sample.zip.\n",
+ "[nltk_data] | Downloading package movie_reviews to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/movie_reviews.zip.\n",
+ "[nltk_data] | Downloading package mte_teip5 to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/mte_teip5.zip.\n",
+ "[nltk_data] | Downloading package mwa_ppdb to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping misc/mwa_ppdb.zip.\n",
+ "[nltk_data] | Downloading package names to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/names.zip.\n",
+ "[nltk_data] | Downloading package nombank.1.0 to /root/nltk_data...\n",
+ "[nltk_data] | Downloading package nonbreaking_prefixes to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/nonbreaking_prefixes.zip.\n",
+ "[nltk_data] | Downloading package nps_chat to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/nps_chat.zip.\n",
+ "[nltk_data] | Downloading package omw to /root/nltk_data...\n",
+ "[nltk_data] | Downloading package omw-1.4 to /root/nltk_data...\n",
+ "[nltk_data] | Downloading package opinion_lexicon to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/opinion_lexicon.zip.\n",
+ "[nltk_data] | Downloading package panlex_swadesh to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Downloading package paradigms to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/paradigms.zip.\n",
+ "[nltk_data] | Downloading package pe08 to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/pe08.zip.\n",
+ "[nltk_data] | Downloading package perluniprops to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping misc/perluniprops.zip.\n",
+ "[nltk_data] | Downloading package pil to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/pil.zip.\n",
+ "[nltk_data] | Downloading package pl196x to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/pl196x.zip.\n",
+ "[nltk_data] | Downloading package porter_test to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping stemmers/porter_test.zip.\n",
+ "[nltk_data] | Downloading package ppattach to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/ppattach.zip.\n",
+ "[nltk_data] | Downloading package problem_reports to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/problem_reports.zip.\n",
+ "[nltk_data] | Downloading package product_reviews_1 to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/product_reviews_1.zip.\n",
+ "[nltk_data] | Downloading package product_reviews_2 to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/product_reviews_2.zip.\n",
+ "[nltk_data] | Downloading package propbank to /root/nltk_data...\n",
+ "[nltk_data] | Downloading package pros_cons to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/pros_cons.zip.\n",
+ "[nltk_data] | Downloading package ptb to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/ptb.zip.\n",
+ "[nltk_data] | Downloading package punkt to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping tokenizers/punkt.zip.\n",
+ "[nltk_data] | Downloading package qc to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/qc.zip.\n",
+ "[nltk_data] | Downloading package reuters to /root/nltk_data...\n",
+ "[nltk_data] | Downloading package rslp to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping stemmers/rslp.zip.\n",
+ "[nltk_data] | Downloading package rte to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/rte.zip.\n",
+ "[nltk_data] | Downloading package sample_grammars to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping grammars/sample_grammars.zip.\n",
+ "[nltk_data] | Downloading package semcor to /root/nltk_data...\n",
+ "[nltk_data] | Downloading package senseval to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/senseval.zip.\n",
+ "[nltk_data] | Downloading package sentence_polarity to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/sentence_polarity.zip.\n",
+ "[nltk_data] | Downloading package sentiwordnet to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/sentiwordnet.zip.\n",
+ "[nltk_data] | Downloading package shakespeare to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/shakespeare.zip.\n",
+ "[nltk_data] | Downloading package sinica_treebank to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/sinica_treebank.zip.\n",
+ "[nltk_data] | Downloading package smultron to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/smultron.zip.\n",
+ "[nltk_data] | Downloading package snowball_data to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Downloading package spanish_grammars to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping grammars/spanish_grammars.zip.\n",
+ "[nltk_data] | Downloading package state_union to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/state_union.zip.\n",
+ "[nltk_data] | Downloading package stopwords to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/stopwords.zip.\n",
+ "[nltk_data] | Downloading package subjectivity to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/subjectivity.zip.\n",
+ "[nltk_data] | Downloading package swadesh to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/swadesh.zip.\n",
+ "[nltk_data] | Downloading package switchboard to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/switchboard.zip.\n",
+ "[nltk_data] | Downloading package tagsets to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping help/tagsets.zip.\n",
+ "[nltk_data] | Downloading package timit to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/timit.zip.\n",
+ "[nltk_data] | Downloading package toolbox to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/toolbox.zip.\n",
+ "[nltk_data] | Downloading package treebank to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/treebank.zip.\n",
+ "[nltk_data] | Downloading package twitter_samples to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/twitter_samples.zip.\n",
+ "[nltk_data] | Downloading package udhr to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/udhr.zip.\n",
+ "[nltk_data] | Downloading package udhr2 to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/udhr2.zip.\n",
+ "[nltk_data] | Downloading package unicode_samples to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/unicode_samples.zip.\n",
+ "[nltk_data] | Downloading package universal_tagset to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping taggers/universal_tagset.zip.\n",
+ "[nltk_data] | Downloading package universal_treebanks_v20 to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Downloading package vader_lexicon to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Downloading package verbnet to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/verbnet.zip.\n",
+ "[nltk_data] | Downloading package verbnet3 to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/verbnet3.zip.\n",
+ "[nltk_data] | Downloading package webtext to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/webtext.zip.\n",
+ "[nltk_data] | Downloading package wmt15_eval to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping models/wmt15_eval.zip.\n",
+ "[nltk_data] | Downloading package word2vec_sample to\n",
+ "[nltk_data] | /root/nltk_data...\n",
+ "[nltk_data] | Unzipping models/word2vec_sample.zip.\n",
+ "[nltk_data] | Downloading package wordnet to /root/nltk_data...\n",
+ "[nltk_data] | Downloading package wordnet2021 to /root/nltk_data...\n",
+ "[nltk_data] | Downloading package wordnet31 to /root/nltk_data...\n",
+ "[nltk_data] | Downloading package wordnet_ic to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/wordnet_ic.zip.\n",
+ "[nltk_data] | Downloading package words to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/words.zip.\n",
+ "[nltk_data] | Downloading package ycoe to /root/nltk_data...\n",
+ "[nltk_data] | Unzipping corpora/ycoe.zip.\n",
+ "[nltk_data] | \n",
+ "[nltk_data] Done downloading collection all\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 2
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "paragraph = \"\"\"\n",
+ "Looking back on a childhood filled with events and memories, I find it rather difficult to pick one that leaves me with the fabled \"warm and fuzzy feelings.\" As the daughter of an Air Force major, I had the pleasure of traveling across America in many moving trips. I have visited the monstrous trees of the Sequoia National Forest, stood on the edge of the Grand Canyon and have jumped on the beds at Caesar's Palace in Lake Tahoe.\"\n",
+ "\"The day I picked my dog up from the pound was one of the happiest days of both of our lives. I had gone to the pound just a week earlier with the idea that I would just \"look\" at a puppy. Of course, you can no more just look at those squiggling little faces so filled with hope and joy than you can stop the sun from setting in the evening. I knew within minutes of walking in the door that I would get a puppy… but it wasn't until I saw him that I knew I had found my puppy.\"\n",
+ "\"Looking for houses was supposed to be a fun and exciting process. \"\"\" "
+ ],
+ "metadata": {
+ "id": "D9AVXZbOD90K"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sentences=nltk.sent_tokenize(paragraph)"
+ ],
+ "metadata": {
+ "id": "02P9DKMPEgpn"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sentences"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Ie-3XElKEpQg",
+ "outputId": "e6de2fac-35a9-46ea-dbf1-5adcb2d28515"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "['\\nLooking back on a childhood filled with events and memories, I find it rather difficult to pick one that leaves me with the fabled \"warm and fuzzy feelings.\"',\n",
+ " 'As the daughter of an Air Force major, I had the pleasure of traveling across America in many moving trips.',\n",
+ " 'I have visited the monstrous trees of the Sequoia National Forest, stood on the edge of the Grand Canyon and have jumped on the beds at Caesar\\'s Palace in Lake Tahoe.\"',\n",
+ " '\"The day I picked my dog up from the pound was one of the happiest days of both of our lives.',\n",
+ " 'I had gone to the pound just a week earlier with the idea that I would just \"look\" at a puppy.',\n",
+ " 'Of course, you can no more just look at those squiggling little faces so filled with hope and joy than you can stop the sun from setting in the evening.',\n",
+ " 'I knew within minutes of walking in the door that I would get a puppy… but it wasn\\'t until I saw him that I knew I had found my puppy.\"',\n",
+ " '\"Looking for houses was supposed to be a fun and exciting process.']"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "words=nltk.word_tokenize(paragraph)"
+ ],
+ "metadata": {
+ "id": "KpAOneHUEsLh"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "words"
+ ],
+ "metadata": {
+ "id": "4T8lkbttEzis"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Stemming"
+ ],
+ "metadata": {
+ "id": "Lf7SZN19E28b"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sentences = nltk.sent_tokenize(paragraph)"
+ ],
+ "metadata": {
+ "id": "aw7iois5E0Qx"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from nltk.stem import PorterStemmer\n",
+ "from nltk.corpus import stopwords\n",
+ "stemmer=PorterStemmer()"
+ ],
+ "metadata": {
+ "id": "lmXTw0I0E7Ka"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "stopwords.words('english')"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "U4_OpnzaFNgv",
+ "outputId": "3b832b64-7510-41a7-aa45-9d403bee48ba"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "['i',\n",
+ " 'me',\n",
+ " 'my',\n",
+ " 'myself',\n",
+ " 'we',\n",
+ " 'our',\n",
+ " 'ours',\n",
+ " 'ourselves',\n",
+ " 'you',\n",
+ " \"you're\",\n",
+ " \"you've\",\n",
+ " \"you'll\",\n",
+ " \"you'd\",\n",
+ " 'your',\n",
+ " 'yours',\n",
+ " 'yourself',\n",
+ " 'yourselves',\n",
+ " 'he',\n",
+ " 'him',\n",
+ " 'his',\n",
+ " 'himself',\n",
+ " 'she',\n",
+ " \"she's\",\n",
+ " 'her',\n",
+ " 'hers',\n",
+ " 'herself',\n",
+ " 'it',\n",
+ " \"it's\",\n",
+ " 'its',\n",
+ " 'itself',\n",
+ " 'they',\n",
+ " 'them',\n",
+ " 'their',\n",
+ " 'theirs',\n",
+ " 'themselves',\n",
+ " 'what',\n",
+ " 'which',\n",
+ " 'who',\n",
+ " 'whom',\n",
+ " 'this',\n",
+ " 'that',\n",
+ " \"that'll\",\n",
+ " 'these',\n",
+ " 'those',\n",
+ " 'am',\n",
+ " 'is',\n",
+ " 'are',\n",
+ " 'was',\n",
+ " 'were',\n",
+ " 'be',\n",
+ " 'been',\n",
+ " 'being',\n",
+ " 'have',\n",
+ " 'has',\n",
+ " 'had',\n",
+ " 'having',\n",
+ " 'do',\n",
+ " 'does',\n",
+ " 'did',\n",
+ " 'doing',\n",
+ " 'a',\n",
+ " 'an',\n",
+ " 'the',\n",
+ " 'and',\n",
+ " 'but',\n",
+ " 'if',\n",
+ " 'or',\n",
+ " 'because',\n",
+ " 'as',\n",
+ " 'until',\n",
+ " 'while',\n",
+ " 'of',\n",
+ " 'at',\n",
+ " 'by',\n",
+ " 'for',\n",
+ " 'with',\n",
+ " 'about',\n",
+ " 'against',\n",
+ " 'between',\n",
+ " 'into',\n",
+ " 'through',\n",
+ " 'during',\n",
+ " 'before',\n",
+ " 'after',\n",
+ " 'above',\n",
+ " 'below',\n",
+ " 'to',\n",
+ " 'from',\n",
+ " 'up',\n",
+ " 'down',\n",
+ " 'in',\n",
+ " 'out',\n",
+ " 'on',\n",
+ " 'off',\n",
+ " 'over',\n",
+ " 'under',\n",
+ " 'again',\n",
+ " 'further',\n",
+ " 'then',\n",
+ " 'once',\n",
+ " 'here',\n",
+ " 'there',\n",
+ " 'when',\n",
+ " 'where',\n",
+ " 'why',\n",
+ " 'how',\n",
+ " 'all',\n",
+ " 'any',\n",
+ " 'both',\n",
+ " 'each',\n",
+ " 'few',\n",
+ " 'more',\n",
+ " 'most',\n",
+ " 'other',\n",
+ " 'some',\n",
+ " 'such',\n",
+ " 'no',\n",
+ " 'nor',\n",
+ " 'not',\n",
+ " 'only',\n",
+ " 'own',\n",
+ " 'same',\n",
+ " 'so',\n",
+ " 'than',\n",
+ " 'too',\n",
+ " 'very',\n",
+ " 's',\n",
+ " 't',\n",
+ " 'can',\n",
+ " 'will',\n",
+ " 'just',\n",
+ " 'don',\n",
+ " \"don't\",\n",
+ " 'should',\n",
+ " \"should've\",\n",
+ " 'now',\n",
+ " 'd',\n",
+ " 'll',\n",
+ " 'm',\n",
+ " 'o',\n",
+ " 're',\n",
+ " 've',\n",
+ " 'y',\n",
+ " 'ain',\n",
+ " 'aren',\n",
+ " \"aren't\",\n",
+ " 'couldn',\n",
+ " \"couldn't\",\n",
+ " 'didn',\n",
+ " \"didn't\",\n",
+ " 'doesn',\n",
+ " \"doesn't\",\n",
+ " 'hadn',\n",
+ " \"hadn't\",\n",
+ " 'hasn',\n",
+ " \"hasn't\",\n",
+ " 'haven',\n",
+ " \"haven't\",\n",
+ " 'isn',\n",
+ " \"isn't\",\n",
+ " 'ma',\n",
+ " 'mightn',\n",
+ " \"mightn't\",\n",
+ " 'mustn',\n",
+ " \"mustn't\",\n",
+ " 'needn',\n",
+ " \"needn't\",\n",
+ " 'shan',\n",
+ " \"shan't\",\n",
+ " 'shouldn',\n",
+ " \"shouldn't\",\n",
+ " 'wasn',\n",
+ " \"wasn't\",\n",
+ " 'weren',\n",
+ " \"weren't\",\n",
+ " 'won',\n",
+ " \"won't\",\n",
+ " 'wouldn',\n",
+ " \"wouldn't\"]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 12
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sentences"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "bfInToe-FeF_",
+ "outputId": "9c9b13af-5a22-4214-a349-cb0ee794a850"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "['\\nLooking back on a childhood filled with events and memories, I find it rather difficult to pick one that leaves me with the fabled \"warm and fuzzy feelings.\"',\n",
+ " 'As the daughter of an Air Force major, I had the pleasure of traveling across America in many moving trips.',\n",
+ " 'I have visited the monstrous trees of the Sequoia National Forest, stood on the edge of the Grand Canyon and have jumped on the beds at Caesar\\'s Palace in Lake Tahoe.\"',\n",
+ " '\"The day I picked my dog up from the pound was one of the happiest days of both of our lives.',\n",
+ " 'I had gone to the pound just a week earlier with the idea that I would just \"look\" at a puppy.',\n",
+ " 'Of course, you can no more just look at those squiggling little faces so filled with hope and joy than you can stop the sun from setting in the evening.',\n",
+ " 'I knew within minutes of walking in the door that I would get a puppy… but it wasn\\'t until I saw him that I knew I had found my puppy.\"',\n",
+ " '\"Looking for houses was supposed to be a fun and exciting process.']"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 16
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Stemming: drop stopwords, then Porter-stem the rest (the filter is case-sensitive, so 'I'/'The' are kept).\n",
+ "english_stopwords = set(stopwords.words('english'))  # built once instead of once per token\n",
+ "for i in range(len(sentences)):\n",
+ " words = [stemmer.stem(word) for word in nltk.word_tokenize(sentences[i]) if word not in english_stopwords]\n",
+ " sentences[i] = ' '.join(words)"
+ ],
+ "metadata": {
+ "id": "2pqIQUl9FHPC"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sentences"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "mmykru4NFZcW",
+ "outputId": "886dda59-1892-4c99-cf4c-cb4f61a7946f"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "[\"look back childhood fill event memori , i find rather difficult pick one leav fabl `` warm fuzzi feel . ''\",\n",
+ " 'as daughter air forc major , i pleasur travel across america mani move trip .',\n",
+ " \"i visit monstrou tree sequoia nation forest , stood edg grand canyon jump bed caesar 's palac lake taho . ''\",\n",
+ " '`` the day i pick dog pound one happiest day live .',\n",
+ " \"i gone pound week earlier idea i would `` look '' puppi .\",\n",
+ " 'of cours , look squiggl littl face fill hope joy stop sun set even .',\n",
+ " \"i knew within minut walk door i would get puppy… n't i saw i knew i found puppi . ''\",\n",
+ " '`` look hous suppos fun excit process .']"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 18
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Lemmatization"
+ ],
+ "metadata": {
+ "id": "R1AAV3d0GHSp"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from nltk.stem import WordNetLemmatizer\n",
+ "from nltk.corpus import stopwords\n",
+ "# Re-tokenize the paragraph, drop stopwords, and replace each remaining token with its WordNet lemma.\n",
+ "sentences = nltk.sent_tokenize(paragraph)\n",
+ "lemmatizer = WordNetLemmatizer()\n",
+ "english_stopwords = set(stopwords.words('english'))  # built once instead of once per token\n",
+ "for i in range(len(sentences)):\n",
+ " words = nltk.word_tokenize(sentences[i])\n",
+ " words = [lemmatizer.lemmatize(word) for word in words if word not in english_stopwords]\n",
+ " sentences[i] = ' '.join(words)"
+ ],
+ "metadata": {
+ "id": "F--Uhj7tFxiZ"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sentences"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "G_1QMDdeGSOb",
+ "outputId": "cf478d5a-a28c-4e86-add4-f4c39814c843"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "[\"Looking back childhood filled event memory , I find rather difficult pick one leaf fabled `` warm fuzzy feeling . ''\",\n",
+ " 'As daughter Air Force major , I pleasure traveling across America many moving trip .',\n",
+ " \"I visited monstrous tree Sequoia National Forest , stood edge Grand Canyon jumped bed Caesar 's Palace Lake Tahoe . ''\",\n",
+ " '`` The day I picked dog pound one happiest day life .',\n",
+ " \"I gone pound week earlier idea I would `` look '' puppy .\",\n",
+ " 'Of course , look squiggling little face filled hope joy stop sun setting evening .',\n",
+ " \"I knew within minute walking door I would get puppy… n't I saw I knew I found puppy . ''\",\n",
+ " '`` Looking house supposed fun exciting process .']"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 20
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "POS Tagging"
+ ],
+ "metadata": {
+ "id": "3lkNI8b7Gjqu"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sentence = \"A quick brown fox runs over a greedy dog\"\n",
+ "token = nltk.word_tokenize(sentence)\n",
+ "token"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "hKprfi1MGXgG",
+ "outputId": "14cad267-df4d-4600-da46-eb891a92a42e"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "['A', 'quick', 'brown', 'fox', 'runs', 'over', 'a', 'greedy', 'dog']"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 21
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "nltk.pos_tag(token)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "-O5yFTdRG6Vz",
+ "outputId": "7d79f04e-83e7-4957-c554-414b57486d25"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "[('A', 'DT'),\n",
+ " ('quick', 'JJ'),\n",
+ " ('brown', 'NN'),\n",
+ " ('fox', 'NN'),\n",
+ " ('runs', 'VBZ'),\n",
+ " ('over', 'IN'),\n",
+ " ('a', 'DT'),\n",
+ " ('greedy', 'NN'),\n",
+ " ('dog', 'NN')]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 23
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# We can get more details about any POS tag using the help function of NLTK, as follows.\n",
+ "nltk.help.upenn_tagset(\"JJ\")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "1twd5yS-G9TQ",
+ "outputId": "531241a4-8283-4d73-a34f-d9092abeacf2"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "JJ: adjective or numeral, ordinal\n",
+ " third ill-mannered pre-war regrettable oiled calamitous first separable\n",
+ " ectoplasmic battery-powered participatory fourth still-to-be-named\n",
+ " multilingual multi-disciplinary ...\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "nltk.help.upenn_tagset(\"VBZ\")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "lbVf80v1HF-5",
+ "outputId": "19dc06c9-a9ad-4f73-ef18-611bf1e7460a"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "VBZ: verb, present tense, 3rd person singular\n",
+ " bases reconstructs marks mixes displeases seals carps weaves snatches\n",
+ " slumps stretches authorizes smolders pictures emerges stockpiles\n",
+ " seduces fizzes uses bolsters slaps speaks pleads ...\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "BAG OF WORDS"
+ ],
+ "metadata": {
+ "id": "x6wAhMyrKtIo"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import re #Regular Expression\n",
+ "from nltk.corpus import stopwords\n",
+ "from nltk.stem.porter import PorterStemmer\n",
+ "from nltk.stem import WordNetLemmatizer"
+ ],
+ "metadata": {
+ "id": "_PrvOiaJHNvA"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "ps = PorterStemmer()\n",
+ "wordnet=WordNetLemmatizer()\n",
+ "sentences = nltk.sent_tokenize(paragraph)\n",
+ "corpus = []"
+ ],
+ "metadata": {
+ "id": "_Ez-DYGUPDZ7"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sentences[0]+'11212'"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 53
+ },
+ "id": "h6G9qe1tPXlN",
+ "outputId": "c0f3b138-0caf-473b-e1dd-cdf3b3d15a10"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "'\\nLooking back on a childhood filled with events and memories, I find it rather difficult to pick one that leaves me with the fabled \"warm and fuzzy feelings.\"11212'"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "string"
+ }
+ },
+ "metadata": {},
+ "execution_count": 31
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "re.sub('[^a-zA-Z]', ' ', sentences[0])"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 53
+ },
+ "id": "VaStGzq4PZpm",
+ "outputId": "320c18f3-7aef-44a9-fded-0797ec6a1e12"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "' Looking back on a childhood filled with events and memories I find it rather difficult to pick one that leaves me with the fabled warm and fuzzy feelings '"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "string"
+ }
+ },
+ "metadata": {},
+ "execution_count": 32
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Clean each sentence: keep letters only, lower-case, drop stop words, lemmatize.\n",
+ "stop_words = set(stopwords.words('english'))  # build once, not once per word\n",
+ "corpus = []  # reset here so re-running this cell does not append duplicates\n",
+ "for sentence in sentences:\n",
+ "    words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()\n",
+ "    kept = [wordnet.lemmatize(word) for word in words if word not in stop_words]\n",
+ "    corpus.append(' '.join(kept))"
+ ],
+ "metadata": {
+ "id": "XsuGJtw7PGGq"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "corpus"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "HmdObQvfPyUU",
+ "outputId": "3d03fb64-fba5-46e5-9c09-88099479d484"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "['looking back childhood filled event memory find rather difficult pick one leaf fabled warm fuzzy feeling',\n",
+ " 'daughter air force major pleasure traveling across america many moving trip',\n",
+ " 'visited monstrous tree sequoia national forest stood edge grand canyon jumped bed caesar palace lake tahoe',\n",
+ " 'day picked dog pound one happiest day life',\n",
+ " 'gone pound week earlier idea would look puppy',\n",
+ " 'course look squiggling little face filled hope joy stop sun setting evening',\n",
+ " 'knew within minute walking door would get puppy saw knew found puppy',\n",
+ " 'looking house supposed fun exciting process']"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 34
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Creating the Bag of Words model\n",
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "cv = CountVectorizer(max_features = 1500)\n",
+ "X = cv.fit_transform(corpus).toarray()"
+ ],
+ "metadata": {
+ "id": "ectRiysqPzMQ"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "X"
+ ],
+ "metadata": {
+ "id": "MrwQOSesP8eo"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "X.shape"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "E0v7a9BeP85U",
+ "outputId": "128d5968-342f-423c-d8f7-6224ecb2242f"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(8, 79)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 37
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
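+ "## TF-IDF\n\nSame cleaned corpus as above, but each word count is weighted by how rare the word is across sentences."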
+ ],
+ "metadata": {
+ "id": "UTQhnfJRQImf"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import re\n",
+ "from nltk.corpus import stopwords\n",
+ "from nltk.stem.porter import PorterStemmer\n",
+ "from nltk.stem import WordNetLemmatizer"
+ ],
+ "metadata": {
+ "id": "GqcEOcjaQEvE"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Rebuild the cleaned corpus for TF-IDF (same cleaning steps as the Bag of Words section).\n",
+ "wordnet = WordNetLemmatizer()\n",
+ "sentences = nltk.sent_tokenize(paragraph)\n",
+ "stop_words = set(stopwords.words('english'))  # build once, not once per word\n",
+ "corpus = []\n",
+ "for sentence in sentences:\n",
+ "    # Keep letters only, lower-case, and split into words.\n",
+ "    words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()\n",
+ "    # Drop stop words and lemmatize what is left.\n",
+ "    kept = [wordnet.lemmatize(word) for word in words if word not in stop_words]\n",
+ "    corpus.append(' '.join(kept))"
+ ],
+ "metadata": {
+ "id": "97CExC3IQLZA"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Creating TF-IDF Model\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "tfidf=TfidfVectorizer()"
+ ],
+ "metadata": {
+ "id": "fZa067jbQNZs"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "X_tfidf=tfidf.fit_transform(corpus).toarray()"
+ ],
+ "metadata": {
+ "id": "7p3a-r96QQYB"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "X_tfidf"
+ ],
+ "metadata": {
+ "id": "lACUy-n7QRzC"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "X_tfidf.shape"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "PA1vkbzKQS-E",
+ "outputId": "6dda2b2b-6cf5-473c-b164-718881abec6c"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(8, 79)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 43
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Word2Vec\n\nLearn a dense vector for each word from the words that appear around it."
+ ],
+ "metadata": {
+ "id": "yYrQ9yQQQuCn"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "### Word2Vec intuition: words used in similar contexts end up with similar vectors,\n",
+ "### e.g. king is similar to queen, and man is similar to woman.\n",
+ "text = re.sub(r'\\[[0-9]*\\]',' ',paragraph)\n",
+ "text = re.sub(r'\\s+',' ',text)\n",
+ "text = text.lower()\n",
+ "text = re.sub(r'\\d',' ',text)\n",
+ "text = re.sub(r'\\s+',' ',text)"
+ ],
+ "metadata": {
+ "id": "m5AucxtkQabA"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Preparing the dataset\n",
+ "sentences = nltk.sent_tokenize(text)"
+ ],
+ "metadata": {
+ "id": "SuWK_DUGS-oP"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sentences"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "kSLI0sVbTCuW",
+ "outputId": "f3c818a4-a60d-4a7a-b42a-752406f5501e"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "[' looking back on a childhood filled with events and memories, i find it rather difficult to pick one that leaves me with the fabled \"warm and fuzzy feelings.\"',\n",
+ " 'as the daughter of an air force major, i had the pleasure of traveling across america in many moving trips.',\n",
+ " 'i have visited the monstrous trees of the sequoia national forest, stood on the edge of the grand canyon and have jumped on the beds at caesar\\'s palace in lake tahoe.\"',\n",
+ " '\"the day i picked my dog up from the pound was one of the happiest days of both of our lives.',\n",
+ " 'i had gone to the pound just a week earlier with the idea that i would just \"look\" at a puppy.',\n",
+ " 'of course, you can no more just look at those squiggling little faces so filled with hope and joy than you can stop the sun from setting in the evening.',\n",
+ " 'i knew within minutes of walking in the door that i would get a puppy… but it wasn\\'t until i saw him that i knew i had found my puppy.\"',\n",
+ " '\"looking for houses was supposed to be a fun and exciting process.']"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 46
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sentences = [nltk.word_tokenize(sentence) for sentence in sentences]"
+ ],
+ "metadata": {
+ "id": "yVOYBPlCTEz-"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Stop-word removal: build the stop-word set once instead of re-reading it for every token\n",
+ "stop_words = set(stopwords.words('english'))\n",
+ "sentences = [[word for word in sentence if word not in stop_words] for sentence in sentences]"
+ ],
+ "metadata": {
+ "id": "D4poc4B4THho"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sentences"
+ ],
+ "metadata": {
+ "id": "V8QdexkOTIy5"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from gensim.models import Word2Vec"
+ ],
+ "metadata": {
+ "id": "BwMAygitTKnk"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "model=Word2Vec(sentences,min_count=1)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "mBxyqLEZTUpH",
+ "outputId": "3d213927-9178-4a51-f91a-81b2636614cd"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "WARNING:gensim.models.base_any2vec:under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "model"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "DPiFuI5-Te9K",
+ "outputId": "28eec034-8600-4a0e-9c45-6ba96512959a"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "execution_count": 52
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "model.wv.vocab"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "uQl63dqLTgCN",
+ "outputId": "457ccdcb-b4ba-4083-efe3-5bc2cbef9f37"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "{\"''\": ,\n",
+ " \"'s\": ,\n",
+ " ',': ,\n",
+ " '.': ,\n",
+ " '``': ,\n",
+ " 'across': ,\n",
+ " 'air': ,\n",
+ " 'america': ,\n",
+ " 'back': ,\n",
+ " 'beds': ,\n",
+ " 'caesar': ,\n",
+ " 'canyon': ,\n",
+ " 'childhood': ,\n",
+ " 'course': ,\n",
+ " 'daughter': ,\n",
+ " 'day': ,\n",
+ " 'days': ,\n",
+ " 'difficult': ,\n",
+ " 'dog': ,\n",
+ " 'door': ,\n",
+ " 'earlier': ,\n",
+ " 'edge': ,\n",
+ " 'evening': ,\n",
+ " 'events': ,\n",
+ " 'exciting': ,\n",
+ " 'fabled': ,\n",
+ " 'faces': ,\n",
+ " 'feelings': ,\n",
+ " 'filled': ,\n",
+ " 'find': ,\n",
+ " 'force': ,\n",
+ " 'forest': ,\n",
+ " 'found': ,\n",
+ " 'fun': ,\n",
+ " 'fuzzy': ,\n",
+ " 'get': ,\n",
+ " 'gone': ,\n",
+ " 'grand': ,\n",
+ " 'happiest': ,\n",
+ " 'hope': ,\n",
+ " 'houses': ,\n",
+ " 'idea': ,\n",
+ " 'joy': ,\n",
+ " 'jumped': ,\n",
+ " 'knew': ,\n",
+ " 'lake': ,\n",
+ " 'leaves': ,\n",
+ " 'little': ,\n",
+ " 'lives': ,\n",
+ " 'look': ,\n",
+ " 'looking': ,\n",
+ " 'major': ,\n",
+ " 'many': ,\n",
+ " 'memories': ,\n",
+ " 'minutes': ,\n",
+ " 'monstrous': ,\n",
+ " 'moving': ,\n",
+ " \"n't\": ,\n",
+ " 'national': ,\n",
+ " 'one': ,\n",
+ " 'palace': ,\n",
+ " 'pick': ,\n",
+ " 'picked': ,\n",
+ " 'pleasure': ,\n",
+ " 'pound': ,\n",
+ " 'process': ,\n",
+ " 'puppy': ,\n",
+ " 'puppy…': ,\n",
+ " 'rather': ,\n",
+ " 'saw': ,\n",
+ " 'sequoia': ,\n",
+ " 'setting': ,\n",
+ " 'squiggling': ,\n",
+ " 'stood': ,\n",
+ " 'stop': ,\n",
+ " 'sun': ,\n",
+ " 'supposed': ,\n",
+ " 'tahoe': ,\n",
+ " 'traveling': ,\n",
+ " 'trees': ,\n",
+ " 'trips': ,\n",
+ " 'visited': ,\n",
+ " 'walking': ,\n",
+ " 'warm': ,\n",
+ " 'week': ,\n",
+ " 'within': ,\n",
+ " 'would': }"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 53
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "model.wv['within']"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "K_3q5caTTkPt",
+ "outputId": "482e91f2-7e1a-4e1d-dec1-4c7c335b1b32"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([ 9.3364244e-05, -2.0226017e-03, 3.0744560e-03, 2.4722507e-03,\n",
+ " -1.7916801e-03, 1.1004605e-03, 1.5778342e-03, -3.4744558e-03,\n",
+ " 2.3985626e-03, 4.4293050e-03, -3.2416270e-03, 9.6469990e-04,\n",
+ " -1.8468881e-03, -4.5837839e-03, -2.7734184e-04, -4.2819157e-03,\n",
+ " -3.4303457e-04, -2.3855946e-03, 8.0992520e-04, -1.1062848e-03,\n",
+ " -3.0107235e-03, -3.3425987e-03, 2.1235049e-03, 2.2391626e-03,\n",
+ " 3.5790335e-03, -5.0837500e-04, -2.4947856e-04, -3.1816968e-04,\n",
+ " 2.5805044e-03, -3.9695371e-03, -3.2627376e-04, 3.3404287e-03,\n",
+ " 3.3210497e-03, -3.7256633e-03, 2.4546732e-03, -3.5926504e-03,\n",
+ " -3.1259684e-03, 4.1785319e-03, -1.8811250e-03, -3.2083079e-04,\n",
+ " 1.0983367e-03, 3.0588740e-03, -3.8055759e-03, 1.8654363e-03,\n",
+ " -2.9959625e-03, 1.9540614e-03, -3.4162696e-03, -2.8583435e-03,\n",
+ " -4.2043673e-03, 4.3449313e-03, 4.6059112e-03, 3.2427472e-03,\n",
+ " -2.5208378e-03, -1.8257565e-03, 6.5149547e-04, 4.7284369e-03,\n",
+ " 4.6374514e-03, -6.3585694e-04, -3.1542520e-03, 3.3707032e-03,\n",
+ " -1.2445718e-03, -3.5111818e-03, 6.5203488e-04, 1.2171916e-03,\n",
+ " -2.3727534e-04, -3.1939638e-04, 9.9689921e-04, 2.6938734e-03,\n",
+ " 4.8971297e-03, 3.5206450e-03, -4.8659677e-03, -1.8277732e-03,\n",
+ " 2.6473652e-03, 2.9146350e-03, -4.9722218e-03, 2.6932417e-03,\n",
+ " 2.5721423e-03, 4.2625722e-03, -7.3851732e-04, -3.2395408e-03,\n",
+ " 1.5004680e-03, -1.8992539e-03, 4.8010377e-03, 4.0566269e-03,\n",
+ " -1.9251317e-03, -2.0484554e-03, -1.7119809e-03, -4.4474346e-03,\n",
+ " -2.1356612e-03, -4.4765472e-03, 4.5961127e-04, -2.1204483e-03,\n",
+ " 2.8737509e-04, -2.6111265e-03, -4.5112278e-03, -1.3529632e-03,\n",
+ " 2.2771490e-04, -4.9307575e-03, -4.3379571e-03, 1.6518446e-03],\n",
+ " dtype=float32)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 57
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "model.wv.most_similar('national')"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "PCP3GC49Ttlw",
+ "outputId": "1299a52e-d719-40f9-9e25-ab5a5de1d60c"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "[('air', 0.2506329417228699),\n",
+ " ('monstrous', 0.20575261116027832),\n",
+ " (\"n't\", 0.19816187024116516),\n",
+ " ('lake', 0.17937466502189636),\n",
+ " ('pound', 0.17282086610794067),\n",
+ " ('fun', 0.13929110765457153),\n",
+ " ('leaves', 0.13837510347366333),\n",
+ " ('happiest', 0.12096145749092102),\n",
+ " ('feelings', 0.11812127381563187),\n",
+ " ('found', 0.10904533416032791)]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 58
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import gensim.downloader"
+ ],
+ "metadata": {
+ "id": "bdRhM-zpT_JZ"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(list(gensim.downloader.info()['models'].keys()))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "yT2Mq4AVUIRk",
+ "outputId": "7e829b86-82ee-4217-d8ad-e70f0dec54f2"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "google_vectors=gensim.downloader.load('glove-twitter-25')"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "cKkqSBIyUYz_",
+ "outputId": "6bbfdf8e-7df4-4dfd-a55f-e5c99beb0e66"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "[==================================================] 100.0% 104.8/104.8MB downloaded\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "google_vectors.most_similar('twitter')  # call directly: the .wv attribute is deprecated on loaded vectors"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "7NjGT-RVUc2j",
+ "outputId": "a7fdc59a-4f83-491c-d0e3-ce863b4ef02c"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).\n",
+ " \"\"\"Entry point for launching an IPython kernel.\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "[('facebook', 0.9480051398277283),\n",
+ " ('tweet', 0.9403422474861145),\n",
+ " ('fb', 0.9342358708381653),\n",
+ " ('instagram', 0.9104823470115662),\n",
+ " ('chat', 0.8964964747428894),\n",
+ " ('hashtag', 0.8885936141014099),\n",
+ " ('tweets', 0.8878157734870911),\n",
+ " ('tl', 0.8778461813926697),\n",
+ " ('link', 0.877821147441864),\n",
+ " ('internet', 0.8753897547721863)]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 62
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ ""
+ ],
+ "metadata": {
+ "id": "_sQy5bfdUr0e"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file