-
Notifications
You must be signed in to change notification settings - Fork 3
/
RemoveStopWd.py
73 lines (58 loc) · 2.18 KB
/
RemoveStopWd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# Author: Yuanwei Wu
# Date: 3/2/2016, 1st version
# Date: 3/5/2016, 2nd version: add the unicode (line24) fixed the EncoderError
# add str() at line54 to fix TypeError: expected a character buffer object
# Description: IR Project: part 1, Document processing and indexing
# Tokenize, remove stop word, stemming
# using NLTK package
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
#from nltk.stem.lancaster import LancasterStemmer
# there are many stemmers in nltk, here I use PorterStemmer
from nltk.stem import PorterStemmer
import os
import codecs # otherwise, it has ascii encoding error
ps = PorterStemmer()
def RemoveStopwdStem(inputfolder, outputfolder, url):
    """Tokenize one document, drop English stop words, and Porter-stem the rest.

    Reads ``inputfolder/url`` as UTF-8 text and writes the stemmed token
    list (its ``str()`` representation, as before) to
    ``outputfolder/url + ".removed"``.

    Args:
        inputfolder: directory containing the input document.
        outputfolder: directory the processed file is written into
            (must exist; the caller is responsible for creating it).
        url: file name of the document; also used to name the output file.
    """
    # codecs.open decodes while reading, which fixes the EncodeError the
    # original open() + unicode() pair worked around -- and, unlike
    # unicode(), it also runs on Python 3.  The with-block guarantees the
    # handle is closed even if word_tokenize() later raises.
    with codecs.open(os.path.join(inputfolder, url), 'r', encoding='utf-8') as init_file:
        init_word = init_file.read()
    # tokenize
    tokenized_word = word_tokenize(init_word)
    # remove stop words; a set gives O(1) membership tests per token
    stop_words = set(stopwords.words("english"))
    removedstop_word = [w for w in tokenized_word if w not in stop_words]
    # stemming
    stemmed_words = [ps.stem(w) for w in removedstop_word]
    # Write the token list.  str() of a list is repr-based and ASCII-safe,
    # matching the original output format; utf-8 keeps any non-ASCII stems
    # from crashing the write.
    with codecs.open(os.path.join(outputfolder, url + ".removed"), "w", encoding='utf-8') as mid_file:
        mid_file.write(str(stemmed_words))
# Driver: process every cleaned document from cleaned/ into Removed/.
# Create the output directory up front so RemoveStopwdStem's write cannot
# fail with a missing-folder IOError on a fresh checkout.
if not os.path.isdir("Removed"):
    os.makedirs("Removed")
for filename in os.listdir('cleaned/'):
    print(filename)
    RemoveStopwdStem("cleaned","Removed",filename)
print("Done.")
# test the 1st file in cleaned/
# filename = 'Acadia_National_Park.htm.cleaned'
# RemoveStopwdStem("cleaned","Removed",filename)