-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathTokenizer.java
160 lines (152 loc) · 7.92 KB
/
Tokenizer.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.regex.Pattern;
/**
 * A class that represents and performs the functions of a Tokenizer.
 *
 * <p>Input text is split on runs of space, tab, carriage-return and newline characters. Every resulting piece is
 * normalized by dropping each character that is neither a letter nor a digit and lower-casing the remaining ones.
 * Pieces that end up empty are discarded; all other words are stored in the order in which they were read.
 *
 * @author David Nguyen
 * @since 04/02/2023
 * @version 1.0
 */
public class Tokenizer {
    // Separator between raw words; compiled once here instead of on every split call
    private static final Pattern WORD_SEPARATOR = Pattern.compile("[ \n\t\r]+");

    // Field that stores the normalized word list generated from the given input
    private final ArrayList<String> normalizedWordList = new ArrayList<>();

    /**
     * A constructor that creates a Tokenizer that reads a '.txt' file from the specified file directory and creates a
     * list of all normalized words from the file.
     *
     * <p>Reading is best-effort: if an IOException occurs (for example the file does not exist), its message, cause
     * and stack trace are printed and the Tokenizer keeps whatever words were read before the failure.
     * Time Complexity: O(N) where N is the number of characters in the given .txt file
     *
     * @param file Any file to be read from the specified file directory
     */
    public Tokenizer(String file) {
        // try-with-resources guarantees the reader is closed even when readLine() fails part-way through the file
        try (BufferedReader bufferedReader = new BufferedReader(new FileReader(file))) {
            String individualLine;
            while ((individualLine = bufferedReader.readLine()) != null) {
                addNormalizedWords(individualLine);
            }
        }
        /* If any IOException is caught during the execution of the above steps, print the message of the exception and
         * its cause. Additionally, terminate the above process. */
        catch (IOException exception) {
            System.out.println(exception.getMessage());
            System.out.println(exception.getCause());
            exception.printStackTrace();
        }
    }

    /**
     * A constructor that creates a Tokenizer by reading in an input sequence of Strings and parses all normalized words
     * from the sequence as described in the class description, still maintaining the same order.
     *
     * <p>A null array produces an empty word list, and null elements inside the array are skipped.
     * Time Complexity: O(N) where N is the total number of characters in the given strings array
     *
     * @param text Any array of strings to be parsed into the Tokenizer to be normalized
     */
    public Tokenizer(String[] text) {
        // Nothing to parse; handled explicitly instead of relying on a caught NullPointerException
        if (text == null) {
            return;
        }
        for (String readLine : text) {
            // A null entry carries no words, so it must not stop the remaining entries from being processed
            if (readLine != null) {
                addNormalizedWords(readLine);
            }
        }
    }

    /**
     * A method that will return an ArrayList containing all the normalized words parsed when creating this Tokenizer
     * object. The returned list is a copy, so changing it does not affect this Tokenizer.
     * Time Complexity: O(N) where N is the number of stored words, because the list is copied
     *
     * @return An ArrayList containing all the normalized words parsed when creating this Tokenizer object.
     */
    public ArrayList<String> wordList() {
        return new ArrayList<>(this.normalizedWordList);
    }

    /**
     * A helper method that splits one line of text into raw words, normalizes each of them and appends every
     * non-empty result to the normalized word list.
     * Time Complexity: O(N) where N is the total number of characters in the given line
     *
     * @param line Any non-null line of text to be split and normalized
     */
    private void addNormalizedWords(String line) {
        for (String individualWord : WORD_SEPARATOR.split(line)) {
            String normalizedWord = normalizeStringOrWord(individualWord);
            // Words made only of punctuation (or the empty piece before leading whitespace) normalize to ""
            if (!normalizedWord.isEmpty()) {
                this.normalizedWordList.add(normalizedWord);
            }
        }
    }

    /**
     * A helper method that normalize a string or a word: only letters and digits are kept, and they are lower-cased.
     * Time Complexity: O(N) where N is the total number of characters in the given string
     *
     * @param string Any given sentence or word to be normalized
     * @return The normalized string
     */
    private String normalizeStringOrWord(String string) {
        // The output can never be longer than the input, so the builder is sized up front
        StringBuilder output = new StringBuilder(string.length());
        for (int i = 0; i < string.length(); i++) {
            char character = string.charAt(i);
            if (Character.isLetter(character) || Character.isDigit(character)) {
                output.append(Character.toLowerCase(character));
            }
        }
        return output.toString();
    }
}