-
Notifications
You must be signed in to change notification settings - Fork 0
/
PageEntry.java
160 lines (150 loc) · 4.1 KB
/
PageEntry.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import java.util.*;
import java.io.*;
/**
 * One web page together with the word index built from its text.
 *
 * The page text is read from the "webpages" directory, stripped of
 * punctuation, lower-cased and split on single spaces. Every non-empty token
 * advances the word counter; tokens that are not connector words are also
 * recorded in {@link #pageindex} with a {@code Position}.
 */
public class PageEntry
{
    /** Characters that {@link #punctElim(String)} replaces with a space. */
    private static final String PUNCTUATION = "{}[]<>=().,;\"?#!-:'";

    /** Words that are counted as page words but never added to the index. */
    private static final Set<String> CONNECTOR_WORDS = new HashSet<String>(Arrays.asList(
        "a", "an", "the", "they", "these", "this", "for", "is", "are", "was",
        "of", "or", "and", "does", "will", "whose"));

    /** Directory (relative to the working directory) that holds the page files. */
    private static final String PAGE_DIRECTORY = "webpages";

    public PageIndex pageindex;
    public String name;
    /** Number of non-empty tokens read from the page, connector words included. */
    public int wordCount = 0;

    /** Creates an unnamed page with an empty index. */
    public PageEntry()
    {
        name = "";
        pageindex = new PageIndex();
    }

    /**
     * Replaces every punctuation character of {@code a} with a single space
     * and lower-cases the result.
     *
     * @param a the raw text; must not be null
     * @return the cleaned, lower-cased text (same length as {@code a} for
     *         ordinary text)
     */
    public String punctElim(String a)
    {
        // Single pass with a builder instead of rebuilding the string per hit.
        StringBuilder cleaned = new StringBuilder(a.length());
        for (int i = 0; i < a.length(); i++)
        {
            char c = a.charAt(i);
            cleaned.append(PUNCTUATION.indexOf(c) >= 0 ? ' ' : c);
        }
        // Locale.ROOT keeps "IS" -> "is" even under locales such as Turkish,
        // so the connector-word comparison stays reliable.
        return cleaned.toString().toLowerCase(Locale.ROOT);
    }

    /**
     * Reads the page file {@code webpages/<p>} and indexes its words.
     *
     * If the file cannot be opened, a message is printed to standard output
     * and the page is left with an empty index and a word count of 0.
     *
     * @param p file name of the page inside the "webpages" directory
     */
    public PageEntry(String p)
    {
        pageindex = new PageIndex();
        name = p;
        // File(parent, child) uses the platform separator, so this works on
        // Windows exactly as before and also on other systems.
        File pageFile = new File(PAGE_DIRECTORY, p);
        try (Scanner s = new Scanner(pageFile))
        {
            int counter = 1;        // 1-based index over all non-empty tokens
            int phraseCounter = 1;  // 1-based index over indexed tokens only
            while (s.hasNextLine())
            {
                String[] tokens = punctElim(s.nextLine()).split(" ");
                for (String token : tokens)
                {
                    if (token.isEmpty())
                    {
                        // Produced by consecutive spaces; not a word.
                        continue;
                    }
                    if (CONNECTOR_WORDS.contains(token))
                    {
                        // Counted as a word of the page but not indexed.
                        counter++;
                        continue;
                    }
                    Position pos = new Position(this, counter);
                    pos.phraseIndex = phraseCounter;
                    pageindex.addPositionForWord(singularize(token), pos);
                    counter++;
                    phraseCounter++;
                }
            }
            wordCount = counter - 1;
        }
        catch (FileNotFoundException e)
        {
            System.out.println("File "+p+" not found Exception");
        }
    }

    /** @return the word index of this page */
    public PageIndex getPageIndex()
    {
        return pageindex;
    }

    /**
     * Computes the term frequency of {@code word} on this page: the size of
     * the matching word entry's {@code indexList} divided by
     * {@link #wordCount}.
     *
     * @param word the (already normalised) word to look up
     * @return the frequency, or 0 when the word is absent or the page has no
     *         words (avoids the NaN that 0/0 would produce)
     */
    public float getTermFrequency(String word)
    {
        if (wordCount == 0)
        {
            return 0f;
        }
        int occurrences = 0;
        // The whole list is scanned, so the last matching entry wins.
        for (Node<WordEntry> node = pageindex.wordEntryList.head; node != null; node = node.getNext())
        {
            if (node.getElement().word.equals(word))
            {
                occurrences = node.getElement().indexList.size();
            }
        }
        return (float) occurrences / wordCount;
    }

    /**
     * Finds the first word entry of this page whose word equals {@code w}.
     *
     * @param w the word to look up
     * @return the matching entry, or null when the page does not contain it
     */
    public WordEntry findWordEntryFromPage(String w)
    {
        for (Node<WordEntry> node = this.pageindex.wordEntryList.head; node != null; node = node.getNext())
        {
            if (node.getElement().word.equals(w))
            {
                return node.getElement();
            }
        }
        return null;
    }

    /**
     * Maps the plural forms "structures", "stacks" and "applications" to
     * their singular; every other word is returned unchanged.
     *
     * @param s the word to normalise; must not be null
     * @return the normalised word
     */
    public String modify(String s)
    {
        return singularize(s);
    }

    /** Strips the trailing "s" from the three plurals the index normalises. */
    private static String singularize(String s)
    {
        if (s.equals("structures") || s.equals("stacks") || s.equals("applications"))
        {
            return s.substring(0, s.length() - 1);
        }
        return s;
    }

    /**
     * Counts how often the words of {@code str} occur at consecutive indices.
     *
     * For every index returned by the first word's {@code indexTree.inorder()},
     * the j-th phrase word must be found (via {@code indexTree.search}) at
     * that index plus j.
     * NOTE(review): which index the tree stores (word index or phrase index)
     * is decided in WordEntry/PageIndex - confirm there.
     *
     * @param str the words of the phrase, in order
     * @return the number of matches; 0 for a null or empty phrase or when any
     *         word is missing from the page
     */
    public int pageContainsPhrase(String[] str)
    {
        if (str == null || str.length == 0)
        {
            return 0;
        }
        WordEntry firstwe = findWordEntryFromPage(modify(str[0]));
        if (firstwe == null)
        {
            return 0;
        }
        // Look each following word up once instead of once per candidate
        // position; a missing word means the phrase cannot occur at all.
        WordEntry[] following = new WordEntry[str.length];
        for (int j = 1; j < str.length; j++)
        {
            following[j] = findWordEntryFromPage(modify(str[j]));
            if (following[j] == null)
            {
                return 0;
            }
        }
        int count = 0;
        Vector<Integer> starts = firstwe.indexTree.inorder();
        for (int i = 0; i < starts.size(); i++)
        {
            int start = starts.get(i);
            boolean matches = true;
            for (int j = 1; j < str.length && matches; j++)
            {
                matches = following[j].indexTree.search(start + j);
            }
            if (matches)
            {
                count++;
            }
        }
        return count;
    }
    // public float getRelevanceOfPage(String str[], boolean doTheseWordsRepresentAPhrase)
    // {
    // }
}