forked from kangjianwei/LearningJDK
-
Notifications
You must be signed in to change notification settings - Fork 0
/
RuleBasedBreakIterator.java
1171 lines (1053 loc) · 44.8 KB
/
RuleBasedBreakIterator.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*
* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
* (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
*
* The original version of this source code and documentation
* is copyrighted and owned by Taligent, Inc., a wholly-owned
* subsidiary of IBM. These materials are provided under terms
* of a License Agreement between Taligent and Sun. This technology
* is protected by multiple US and International patents.
*
* This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*/
package sun.text;
import java.nio.BufferUnderflowException;
import java.nio.ByteBuffer;
import java.text.BreakIterator;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.MissingResourceException;
/**
* <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
*
* <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
* and <i>regular expressions.</i></p>
*
* <p>A substitution rule defines a name that can be used in place of an expression. It
* consists of a name, which is a string of characters contained in angle brackets, an equals
* sign, and an expression. (There can be no whitespace on either side of the equals sign.)
* To keep its syntactic meaning intact, the expression must be enclosed in parentheses or
* square brackets. A substitution is visible after its definition, and is filled in using
* simple textual substitution. Substitution definitions can contain other substitutions, as
* long as those substitutions have been defined first. Substitutions are generally used to
* make the regular expressions (which can get quite complex) shorter and easier to read.
* They typically define either character categories or commonly-used subexpressions.</p>
*
* <p>There is one special substitution. If the description defines a substitution
* called "&lt;ignore&gt;", the expression must be a [] expression, and the
* expression defines a set of characters (the "<em>ignore characters</em>") that
* will be transparent to the BreakIterator. A sequence of characters will break the
* same way it would if any ignore characters it contains are taken out. Break
* positions never occur before ignore characters.</p>
*
* <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and
* defines a sequence of characters to be kept together. With one significant exception, the
* iterator uses a longest-possible-match algorithm when matching text to regular
* expressions. The iterator also treats descriptions containing multiple regular expressions
* as if they were ORed together (i.e., as if they were separated by |).</p>
*
* <p>The special characters recognized by the regular-expression parser are as follows:</p>
*
* <blockquote>
* <table border="1" width="100%">
* <tr>
* <td width="6%">*</td>
* <td width="94%">Specifies that the expression preceding the asterisk may occur any number
* of times (including not at all).</td>
* </tr>
* <tr>
* <td width="6%">{}</td>
* <td width="94%">Encloses a sequence of characters that is optional.</td>
* </tr>
* <tr>
* <td width="6%">()</td>
* <td width="94%">Encloses a sequence of characters. If followed by *, the sequence
* repeats. Otherwise, the parentheses are just a grouping device and a way to delimit
* the ends of expressions containing |.</td>
* </tr>
* <tr>
* <td width="6%">|</td>
* <td width="94%">Separates two alternative sequences of characters. Either one
* sequence or the other, but not both, matches this expression. The | character can
* only occur inside ().</td>
* </tr>
* <tr>
* <td width="6%">.</td>
* <td width="94%">Matches any character.</td>
* </tr>
* <tr>
* <td width="6%">*?</td>
* <td width="94%">Specifies a non-greedy asterisk. *? works the same way as *, except
* when there is overlap between the last group of characters in the expression preceding the
* * and the first group of characters following the *. When there is this kind of
* overlap, * will match the longest sequence of characters that match the expression before
* the *, and *? will match the shortest sequence of characters matching the expression
* before the *?. For example, if you have "xxyxyyyxyxyxxyxyxyy" in the text,
* "x[xy]*x" will match through to the last x (i.e., "<strong>xxyxyyyxyxyxxyxyx</strong>yy"),
* but "x[xy]*?x" will only match the first two xes ("<strong>xx</strong>yxyyyxyxyxxyxyxyy").</td>
* </tr>
* <tr>
* <td width="6%">[]</td>
* <td width="94%">Specifies a group of alternative characters. A [] expression will
* match any single character that is specified in the [] expression. For more on the
* syntax of [] expressions, see below.</td>
* </tr>
* <tr>
* <td width="6%">/</td>
* <td width="94%">Specifies where the break position should go if text matches this
* expression. (e.g., "[a-z]*/[:Zs:]*[0-9]" will match if the iterator sees a run
* of letters, followed by a run of whitespace, followed by a digit, but the break position
* will actually go before the whitespace). Expressions that don't contain / put the
* break position at the end of the matching text.</td>
* </tr>
* <tr>
* <td width="6%">\</td>
* <td width="94%">Escape character. The \ itself is ignored, but causes the next
* character to be treated as literal character. This has no effect for many
* characters, but for the characters listed above, this deprives them of their special
* meaning. (There are no special escape sequences for Unicode characters, or tabs and
* newlines; these are all handled by a higher-level protocol. In a Java string,
* "\n" will be converted to a literal newline character by the time the
* regular-expression parser sees it. Of course, this means that \ sequences that are
* visible to the regexp parser must be written as \\ when inside a Java string.) All
* characters in the ASCII range except for letters, digits, and control characters are
* reserved characters to the parser and must be preceded by \ even if they currently don't
* mean anything.</td>
* </tr>
* <tr>
* <td width="6%">!</td>
* <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
* parser that this expression specifies the backwards-iteration behavior of the iterator,
* and not its normal iteration behavior. This is generally only used in situations
* where the automatically-generated backwards-iteration behavior doesn't produce
* satisfactory results and must be supplemented with extra client-specified rules.</td>
* </tr>
* <tr>
* <td width="6%"><em>(all others)</em></td>
* <td width="94%">All other characters are treated as literal characters, which must match
* the corresponding character(s) in the text exactly.</td>
* </tr>
* </table>
* </blockquote>
*
* <p>Within a [] expression, a number of other special characters can be used to specify
* groups of characters:</p>
*
* <blockquote>
* <table border="1" width="100%">
* <tr>
* <td width="6%">-</td>
* <td width="94%">Specifies a range of matching characters. For example
* "[a-p]" matches all lowercase Latin letters from a to p (inclusive). The -
* sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
* language's alphabetical order: "[a-z]" doesn't include capital letters, nor does
* it include accented letters such as a-umlaut.</td>
* </tr>
* <tr>
* <td width="6%">::</td>
* <td width="94%">A pair of colons containing a one- or two-letter code matches all
* characters in the corresponding Unicode category. The two-letter codes are the same
* as the two-letter codes in the Unicode database (for example, "[:Sc::Sm:]"
* matches all currency symbols and all math symbols). Specifying a one-letter code is
* the same as specifying all two-letter codes that begin with that letter (for example,
* "[:L:]" matches all letters, and is equivalent to
* "[:Lu::Ll::Lo::Lm::Lt:]"). Anything other than a valid two-letter Unicode
* category code or a single letter that begins a Unicode category code is illegal within
* colons.</td>
* </tr>
* <tr>
* <td width="6%">[]</td>
* <td width="94%">[] expressions can nest. This has no effect, except when used in
* conjunction with the ^ token.</td>
* </tr>
* <tr>
* <td width="6%">^</td>
* <td width="94%">Excludes the character (or the characters in the [] expression) following
* it from the group of characters. For example, "[a-z^p]" matches all Latin
* lowercase letters except p. "[:L:^[\u4e00-\u9fff]]" matches all letters
* except the Han ideographs.</td>
* </tr>
* <tr>
* <td width="6%"><em>(all others)</em></td>
* <td width="94%">All other characters are treated as literal characters. (For
* example, "[aeiou]" specifies just the letters a, e, i, o, and u.)</td>
* </tr>
* </table>
* </blockquote>
*
* <p>For a more complete explanation, see <a
* href="http://www.ibm.com/java/education/boundaries/boundaries.html">http://www.ibm.com/java/education/boundaries/boundaries.html</a>.
* For examples, see the resource data (which is annotated).</p>
*
* @author Richard Gillam
*/
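/*
* A subclass of BreakIterator that uses a supplied rule file to decide how text is broken up.
*
* [Note: the format of the rule file is not fully understood here, so the explanations of some methods and fields may be incomplete.]
*
* For usage, refer to the BreakIterator test cases.
*/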
public class RuleBasedBreakIterator extends BreakIterator {
/**
* A token used as a character-category value to identify ignore characters
*/
protected static final byte IGNORE = -1;
/**
* Magic number for the BreakIterator data file format.
*/
// Magic bytes that validateRuleData() expects at the very start of the rule data
static final byte[] LABEL = {(byte) 'B', (byte) 'I', (byte) 'd', (byte) 'a', (byte) 't', (byte) 'a', (byte) '\0'};
static final int LABEL_LENGTH = LABEL.length;
/**
* The only rule-data format version accepted by validateRuleData().
*/
// Version value that the rule data is checked against
static final byte supportedVersion = 1;
/**
* The state number of the starting state
*/
private static final short START_STATE = 1;
/**
* The state-transition value indicating "stop"
*/
private static final short STOP_STATE = 0;
/**
* An array length of indices for BMP characters
*/
private static final int BMP_INDICES_LENGTH = 512;
/**
* Tables that index from character values to character category numbers
*/
/*
* lookupCategory() consults charCategoryTable for code points below
* Character.MIN_SUPPLEMENTARY_CODE_POINT and supplementaryCharCategoryTable for all others.
* Both helper tables hold category information extracted from the rule file.
*/
private CompactByteArray charCategoryTable = null;
private SupplementaryCharacterData supplementaryCharCategoryTable = null;
/**
* The table of state transitions used for forward iteration
*/
// State-transition table used for forward iteration
private short[] stateTable = null;
/**
* The table of state transitions used to sync up the iterator with the text in backwards and random-access iteration
*/
// State-transition table used to sync the iterator with the text during backwards and random-access iteration
private short[] backwardsStateTable = null;
/**
* A list of flags indicating which states in the state table are accepting ("end") states
*/
// Flags, indexed by state number, marking which states are accepting ("end") states
private boolean[] endStates = null;
/**
* A list of flags indicating which states in the state table are lookahead states (states which turn lookahead on and off)
*/
// Flags, indexed by state number, marking which states are lookahead states
private boolean[] lookaheadStates = null;
/**
* A table for additional data. May be used by a subclass of RuleBasedBreakIterator.
*/
// Additional data table; may be used by subclasses of RuleBasedBreakIterator
private byte[] additionalData = null;
/**
* The number of character categories (and, thus, the number of columns in the state tables)
*/
// Number of character categories; also the row stride used when indexing the state tables
private int numCategories;
/**
* The character iterator through which this BreakIterator accesses the text
*/
// Iterator over the text being analyzed; created lazily by getText() when still null
private CharacterIterator text = null;
/**
* A CRC32 value of all data in datafile
*/
// Checksum of the rule data (per the data-file format description, a CRC32 value)
private long checksum;
// Break position most recently computed by previous() or following();
// BreakIterator.DONE means nothing is cached (setText() resets it to DONE)
private int cachedLastKnownBreak = BreakIterator.DONE;
/**
 * Constructs a RuleBasedBreakIterator from already-loaded rule data.
 * <p>
 * The data is first validated (magic number, version, length) and then used
 * to populate the state tables. For the character iterator, for example,
 * {@code ruleFile} is "CharacterBreakIteratorData" and {@code ruleData} holds
 * the bytes loaded from that resource.
 *
 * @param ruleFile name of the rule file; passed on to validation/table setup
 *                 and reported in any {@link MissingResourceException}
 * @param ruleData raw contents of the rule file
 *
 * @throws MissingResourceException if the rule data is invalid or corrupted
 */
public RuleBasedBreakIterator(String ruleFile, byte[] ruleData) {
    ByteBuffer buffer = ByteBuffer.wrap(ruleData);
    try {
        validateRuleData(ruleFile, buffer);
        // Fill in the state-transition tables from the remaining rule data.
        setupTables(ruleFile, buffer);
    } catch(BufferUnderflowException underflow) {
        // The data ended before everything was read: report it as a corrupted
        // resource, keeping the underflow as the cause.
        MissingResourceException corrupted = new MissingResourceException("Corrupted rule data file", ruleFile, "");
        corrupted.initCause(underflow);
        throw corrupted;
    }
}
/**
 * Reports the current iteration position without moving the iterator.
 *
 * @return the index currently held by the underlying text iterator.
 */
@Override
public int current() {
    CharacterIterator iter = getText();
    return iter.getIndex();
}
/**
 * Moves the iteration position to the beginning of the text
 * (i.e., the CharacterIterator's starting offset).
 *
 * @return the index of the text iterator after it has been reset to its first position.
 */
@Override
public int first() {
    CharacterIterator iter = getText();
    // Reset the cursor to the start, then report where it landed.
    iter.first();
    int begin = iter.getIndex();
    return begin;
}
/**
 * Moves the iteration position to the end of the text
 * (i.e., the CharacterIterator's ending offset).
 *
 * @return the text's past-the-end offset.
 */
@Override
public int last() {
    CharacterIterator iter = getText();
    // CharacterIterator.last() would stop on the last character, so the
    // past-the-end offset is set explicitly instead.
    int end = iter.getEndIndex();
    iter.setIndex(end);
    return iter.getIndex();
}
/**
 * Moves the iterator backwards to the nearest boundary preceding the current position.
 * When the cursor sits in the middle of an element, that is the element's left boundary.
 *
 * @return the boundary preceding the current position, or
 *         {@code BreakIterator.DONE} if the iterator is already at the beginning of the text.
 */
@Override
public int previous() {
    CharacterIterator iter = getText();
    // Nothing precedes the beginning of the text.
    if(current() == iter.getBeginIndex()) {
        return BreakIterator.DONE;
    }

    int origin = current();

    // Pick a known break position that lies strictly before the origin.
    // The cached break is reused when it qualifies; otherwise step back one
    // code point (so handlePrevious() cannot hand back the origin itself)
    // and let handlePrevious() find a safe position. That position is a valid
    // break, but not necessarily the last one before the origin.
    int anchor = cachedLastKnownBreak;
    boolean cacheUsable = anchor < origin && anchor > BreakIterator.DONE;
    if(cacheUsable) {
        // handlePrevious() might give a closer safe position, but it is slow too.
        iter.setIndex(anchor);
    } else {
        getPrevious();
        anchor = handlePrevious();
    }

    // Walk forward from the anchor; the last break seen before reaching the
    // origin is the answer.
    for(int probe = anchor; probe != BreakIterator.DONE && probe < origin; probe = handleNext()) {
        anchor = probe;
    }

    // Park the iterator on that break and remember it for later calls.
    iter.setIndex(anchor);
    cachedLastKnownBreak = anchor;
    return anchor;
}
/**
 * Advances the iterator to the next boundary position.
 * The work is delegated entirely to {@link #handleNext()}.
 *
 * @return the position of the first boundary after the current one, as reported by handleNext().
 */
@Override
public int next() {
    int boundary = handleNext();
    return boundary;
}
/**
 * Moves the iterator forward or backward by the given number of boundaries.
 * A positive {@code n} calls handleNext() {@code n} times; a negative {@code n}
 * calls previous() {@code -n} times; zero leaves the iterator where it is.
 *
 * @param n the number of steps to move. The sign indicates the direction
 *          (negative is backwards, and positive is forwards).
 *
 * @return the result of the last step taken, or the current position when {@code n} is zero.
 */
@Override
public int next(int n) {
    // Start from the current position; it is the answer when n == 0.
    int position = current();
    for(int forward = n; forward > 0; forward--) {
        position = handleNext();
    }
    for(int backward = n; backward < 0; backward++) {
        position = previous();
    }
    return position;
}
/**
 * Positions the iterator on the first boundary following the specified offset.
 *
 * @param offset the position from which to begin searching for a break position;
 *               validated with checkOffset() against the current text.
 *
 * @return the position of the first break after {@code offset}, or
 *         {@code BreakIterator.DONE} when forward iteration reports no further break.
 */
@Override
public int following(int offset) {
    CharacterIterator iter = getText();
    checkOffset(offset, iter);

    // Temporarily move the internal position to the requested offset.
    iter.setIndex(offset);

    // At the very beginning of the text a plain forward step gives the answer.
    if(offset == iter.getBeginIndex()) {
        int firstBreak = handleNext();
        cachedLastKnownBreak = firstBreak;
        return firstBreak;
    }

    // Otherwise sync up first: start from a known break position before the
    // offset. The cached break is reused when it lies strictly before the
    // offset; if not, handlePrevious() backs up to a safe position. That
    // position may or may not be the last break at or before the offset.
    int boundary = cachedLastKnownBreak;
    if(boundary < offset && boundary > BreakIterator.DONE) {
        // handlePrevious() might give a closer safe position, but it is slow too.
        iter.setIndex(boundary);
    } else {
        boundary = handlePrevious();
    }

    // Advance until the offset has been passed; the stop position is the
    // first break after it.
    while(boundary != BreakIterator.DONE && boundary <= offset) {
        boundary = handleNext();
    }

    cachedLastKnownBreak = boundary;
    return boundary;
}
/**
 * Positions the iterator on the last boundary before the specified offset.
 *
 * @param offset the position to begin searching for a break from;
 *               validated with checkOffset() against the current text.
 *
 * @return the value returned by previous() once the iterator has been moved to {@code offset}.
 */
@Override
public int preceding(int offset) {
    CharacterIterator iter = getText();
    checkOffset(offset, iter);
    // Once the iteration position is the caller's offset, previous() does the rest.
    iter.setIndex(offset);
    int boundary = previous();
    return boundary;
}
/**
 * Tells whether the specified position is a boundary position.
 * The beginning of the text always counts as a boundary. For any other offset
 * the check is delegated to {@code following(offset - 1)}, which also
 * repositions the iterator.
 *
 * @param offset the offset to check; validated with checkOffset() against the current text.
 *
 * @return true if {@code offset} is a boundary position.
 */
@Override
public boolean isBoundary(int offset) {
    CharacterIterator iter = getText();
    checkOffset(offset, iter);
    // Either we are at the start of the text, or the first break after the
    // preceding position is exactly the offset that was asked about.
    return offset == iter.getBeginIndex() || following(offset - 1) == offset;
}
/**
 * Returns the CharacterIterator over the text being analyzed. This is the
 * actual iterator used internally, not a copy; changing its state can have
 * undefined consequences. If you need to change it, clone it first.
 *
 * @return an iterator over the text being analyzed.
 */
@Override
public CharacterIterator getText() {
    if(text != null) {
        return text;
    }
    // The iterator starts out pointing to no text at all; in that state an
    // iterator over the empty string is installed and returned.
    text = new StringCharacterIterator("");
    return text;
}
/**
 * Sets the iterator to analyze a new piece of text and resets the iteration
 * position to the beginning of that text.
 *
 * @param newText an iterator over the text to analyze.
 */
@Override
public void setText(CharacterIterator newText) {
    // A well-behaved CharacterIterator lets its position be set to its end
    // index. Many do not, so probe for that here and wrap the misbehaving
    // ones in a SafeCharIterator so they can still be used with this class.
    int end = newText.getEndIndex();
    boolean endReachable;
    try {
        newText.setIndex(end); // some buggy iterators throw an exception here
        endReachable = (newText.getIndex() == end);
    } catch(IllegalArgumentException e) {
        endReachable = false;
    }

    text = endReachable ? newText : new SafeCharIterator(newText);

    // Start at the beginning of the new text and forget any cached break.
    text.first();
    cachedLastKnownBreak = BreakIterator.DONE;
}
/**
* This method is the actual implementation of the next() method.
* All iteration vectors through here.
* This method initializes the state machine to state 1 and advances through the text character by character
* until we reach the end of the text or the state machine transitions to state 0.
* We update our return value every time the state machine passes through a possible end state.
*
* @return the break position found (the iterator is left on it), or
*         BreakIterator.DONE if the iterator was already at the end of the text
*/
// The concrete implementation behind next()
protected int handleNext() {
// if we're already at the end of the text, return DONE.
CharacterIterator text = getText();
if(text.getIndex() == text.getEndIndex()) {
return BreakIterator.DONE;
}
// no matter what, we always advance at least one character forward
int result = getNextIndex();
// position saved when a lookahead (non-end) state is entered
int lookaheadResult = 0;
// begin in state 1
int state = START_STATE;
int category;
int c = getCurrent();
// loop until we reach the end of the text or transition to state 0
while(c != CharacterIterator.DONE && state != STOP_STATE) {
// look up the current character's character category (which tells us
// which column in the state table to look at)
category = lookupCategory(c);
// if the character isn't an ignore character, look up a state
// transition in the state table
if(category != IGNORE) {
state = lookupState(state, category);
}
// if the state we've just transitioned to is a lookahead state,
// (but not also an end state), save its position. If it's
// both a lookahead state and an end state, update the break position
// to the last saved lookahead-state position
if(lookaheadStates[state]) {
if(endStates[state]) {
result = lookaheadResult;
} else {
lookaheadResult = getNextIndex();
}
}
// otherwise, if the state we've just transitioned to is an accepting
// state, update the break position to be the current iteration position
else {
if(endStates[state]) {
result = getNextIndex();
}
}
c = getNext();
}
// if we've run off the end of the text, and the very last character took us into
// a lookahead state, advance the break position to the lookahead position
// (the theory here is that if there are no characters at all after the lookahead
// position, that always matches the lookahead criteria)
if(c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) {
result = lookaheadResult;
}
// leave the iterator on the break position that was found
text.setIndex(result);
return result;
}
/**
* This method backs the iterator back up to a "safe position" in the text.
* This is a position that we know, without any context, must be a break position.
* The various calling methods then iterate forward from this safe position to the appropriate position to return.
* (For more information, see the description of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
*
* @return the index of the text iterator after backing up
*/
// Backs the iterator's cursor up to a "safe position"
protected int handlePrevious() {
CharacterIterator text = getText();
int state = START_STATE;
int category = 0;
int lastCategory = 0;
int c = getCurrent();
// loop until we reach the beginning of the text or transition to state 0
while(c != CharacterIterator.DONE && state != STOP_STATE) {
// save the last character's category and look up the current character's category
lastCategory = category;
category = lookupCategory(c); // look up the character's category
// if the current character isn't an ignore character, look up a state transition in the backwards state table
if(category != IGNORE) {
// given the current state and the character category, find the next state in backwardsStateTable
state = lookupBackwardState(state, category);
}
// fetch the previous code point; the cursor moves back to the start of that code point
c = getPrevious();
}
// if we didn't march off the beginning of the text, we're either one or two positions away from the real break position.
// (One because of the call to previous() at the end of the loop above,
// and another because the character that takes us into the stop state will always be the character BEFORE the break position.)
if(c != CharacterIterator.DONE) {
if(lastCategory != IGNORE) {
getNext();
getNext();
} else {
getNext();
}
}
return text.getIndex();
}
/**
 * Looks up a character's category (i.e., its category for breaking purposes,
 * as defined by the rule data, not its Unicode category).
 *
 * @param c the code point to classify
 *
 * @return the category from {@code charCategoryTable} when {@code c} is below
 *         {@code Character.MIN_SUPPLEMENTARY_CODE_POINT}, otherwise the value
 *         from {@code supplementaryCharCategoryTable}.
 */
protected int lookupCategory(int c) {
    return (c < Character.MIN_SUPPLEMENTARY_CODE_POINT)
        ? charCategoryTable.elementAt((char) c)
        : supplementaryCharCategoryTable.getValue(c);
}
/**
 * Looks up, in the forward state table, the state to transition to from the
 * given state on the given character category.
 *
 * @param state    the current state (row of the table)
 * @param category the character category (column of the table)
 *
 * @return the entry of {@code stateTable} at row {@code state}, column {@code category}.
 */
protected int lookupState(int state, int category) {
    // The table is stored row by row, numCategories entries per state.
    int cell = state * numCategories + category;
    return stateTable[cell];
}
/**
 * Looks up, in the backwards state table, the state to transition to from the
 * given state on the given character category.
 *
 * @param state    the current state (row of the table)
 * @param category the character category (column of the table)
 *
 * @return the entry of {@code backwardsStateTable} at row {@code state}, column {@code category}.
 */
protected int lookupBackwardState(int state, int category) {
    // The table is stored row by row, numCategories entries per state.
    int cell = state * numCategories + category;
    return backwardsStateTable[cell];
}
/**
 * Returns the code point at the current index without changing the index.
 * A high surrogate followed by a low surrogate is combined into one
 * supplementary code point; in every other case the single char value at
 * the current index is returned.
 */
int getCurrent() {
    char lead = text.current();
    // Not the start of a possible surrogate pair, or nothing left to peek at.
    if(!Character.isHighSurrogate(lead) || text.getIndex() >= text.getEndIndex()) {
        return (int) lead;
    }
    // Peek at the following char, then step back to where we were.
    char trail = text.next();
    text.previous();
    return Character.isLowSurrogate(trail) ? Character.toCodePoint(lead, trail) : (int) lead;
}
/**
 * Advances to the next code point and returns it.
 * When the iterator is already at the end, or the next code point would start
 * at or beyond the end index, {@code CharacterIterator.DONE} is returned and
 * the index is left unchanged.
 */
int getNext() {
    int position = text.getIndex();
    int limit = text.getEndIndex();
    if(position == limit) {
        return CharacterIterator.DONE;
    }
    // Skip over the current code point (one or two chars).
    position += getCurrentCodePointCount();
    if(position >= limit) {
        return CharacterIterator.DONE;
    }
    text.setIndex(position);
    return getCurrent();
}
/**
 * Moves back to the previous code point and returns it.
 * If the char stepped onto is a low surrogate preceded by a high surrogate,
 * the index ends up on the high surrogate and the combined code point is
 * returned; otherwise the index ends up on the single char that is returned.
 */
private int getPrevious() {
    char trail = text.previous();
    // Not the tail of a possible surrogate pair, or nothing before it to inspect.
    if(!Character.isLowSurrogate(trail) || text.getIndex() <= text.getBeginIndex()) {
        return (int) trail;
    }
    char lead = text.previous();
    if(Character.isHighSurrogate(lead)) {
        return Character.toCodePoint(lead, trail);
    }
    // Unpaired low surrogate: undo the extra step back.
    text.next();
    return (int) trail;
}
/**
 * Returns the number of chars occupied by the code point at the current
 * index: 2 for a high surrogate followed by a low surrogate, otherwise 1.
 * The index is unchanged on return.
 */
private int getCurrentCodePointCount() {
    char lead = text.current();
    if(!Character.isHighSurrogate(lead) || text.getIndex() >= text.getEndIndex()) {
        return 1;
    }
    // Peek at the following char, then step back to where we were.
    char trail = text.next();
    text.previous();
    return Character.isLowSurrogate(trail) ? 2 : 1;
}
/**
 * Returns the index just past the code point at the current index,
 * capped at the end index of the text. The iterator is not moved.
 */
private int getNextIndex() {
    int advanced = text.getIndex() + getCurrentCodePointCount();
    // Never report a position beyond the end of the text.
    return Math.min(advanced, text.getEndIndex());
}
/**
 * Validates the magic number, version, and the length of the given data.
 * The buffer is consumed as it is checked: on success its position is just
 * past the 4-byte length field.
 *
 * @param ruleFile name of the rule file, reported in any exception thrown
 * @param bb       buffer positioned at the start of the rule data
 *
 * @throws BufferUnderflowException if the end-of-data is reached while validating data
 * @throws MissingResourceException if validation failed
 */
void validateRuleData(String ruleFile, ByteBuffer bb) {
    // The data must start with the magic label, byte for byte.
    for(byte expected : LABEL) {
        if(bb.get() != expected) {
            throw new MissingResourceException("Wrong magic number", ruleFile, "");
        }
    }

    // Next comes the one-byte format version.
    byte version = bb.get();
    if(version != supportedVersion) {
        throw new MissingResourceException("Unsupported version(" + version + ")", ruleFile, "");
    }

    // Finally, the declared length must account for exactly the rest of the buffer.
    int declaredLength = bb.getInt();
    if(declaredLength != bb.remaining()) {
        throw new MissingResourceException("Wrong data length", ruleFile, "");
    }
}
/**
 * Initializes the fields with the given rule data.
 * The data format is as follows:
 * <pre>
 *   BreakIteratorData {
 *       u1           magic[7];
 *       u1           version;
 *       u4           totalDataSize;
 *       header_info  header;
 *       body         value;
 *   }
 * </pre>
 * <code>totalDataSize</code> is the summation of the size of
 * <code>header_info</code> and <code>body</code> in byte count.
 * <p>
 * In <code>header</code>, each field except for checksum gives the
 * length of the corresponding field in <code>body</code>. Since
 * <code>BMPindices</code> is fixed-length data (512 entries), its length
 * isn't included in <code>header</code>.
 * <code>checksum</code> is a CRC32 value of all in <code>body</code>.
 * <pre>
 *   header_info {
 *       u4           stateTableLength;
 *       u4           backwardsStateTableLength;
 *       u4           endStatesLength;
 *       u4           lookaheadStatesLength;
 *       u4           BMPdataLength;
 *       u4           nonBMPdataLength;
 *       u4           additionalDataLength;
 *       u8           checksum;
 *   }
 * </pre>
 * <p>
 *
 * Finally, <code>BMPindices</code> and <code>BMPdata</code> are set to
 * <code>charCategoryTable</code>. <code>nonBMPdata</code> is set to
 * <code>supplementaryCharCategoryTable</code>.
 * <pre>
 *   body {
 *       u2           stateTable[stateTableLength];
 *       u2           backwardsStateTable[backwardsStateTableLength];
 *       u1           endStates[endStatesLength];
 *       u1           lookaheadStates[lookaheadStatesLength];
 *       u2           BMPindices[512];
 *       u1           BMPdata[BMPdataLength];
 *       u4           nonBMPdata[nonBMPdataLength];
 *       u1           additionalData[additionalDataLength];
 *   }
 * </pre>
 *
 * @param ruleFile name of the rule file, used only when reporting invalid data
 * @param bb       buffer positioned at the start of <code>header_info</code>
 * @throws BufferUnderflowException if the end-of-data is reached before
 *                                  setting up all the tables
 * @throws MissingResourceException if a table length in the header is negative,
 *                                  or if <code>endStatesLength</code> is not positive
 */
// Populates the state-transition tables and category tables from the rule file data.
private void setupTables(String ruleFile, ByteBuffer bb) {
    /* Read header_info. */
    final int stateTableLength = bb.getInt();
    final int backwardsStateTableLength = bb.getInt();
    final int endStatesLength = bb.getInt();
    final int lookaheadStatesLength = bb.getInt();
    final int bmpDataLength = bb.getInt();
    final int nonBmpDataLength = bb.getInt();
    final int additionalDataLength = bb.getInt();
    checksum = bb.getLong();

    // Reject lengths that cannot describe valid tables before allocating anything.
    // A negative length would otherwise fail with NegativeArraySizeException, and
    // endStatesLength == 0 would make the numCategories division below throw
    // ArithmeticException. additionalDataLength is deliberately not checked:
    // any value <= 0 simply means "no additional data".
    if(stateTableLength < 0
        || backwardsStateTableLength < 0
        || endStatesLength <= 0
        || lookaheadStatesLength < 0
        || bmpDataLength < 0
        || nonBmpDataLength < 0) {
        throw new MissingResourceException("Invalid table length", ruleFile, "");
    }

    /* Read stateTable[numCategories * numRows] */
    stateTable = new short[stateTableLength];
    for(int i = 0; i < stateTableLength; i++) {
        stateTable[i] = bb.getShort();
    }

    /* Read backwardsStateTable[numCategories * numRows] */
    backwardsStateTable = new short[backwardsStateTableLength];
    for(int i = 0; i < backwardsStateTableLength; i++) {
        backwardsStateTable[i] = bb.getShort();
    }

    /* Read endStates[numRows]; a byte value of 1 means true. */
    endStates = new boolean[endStatesLength];
    for(int i = 0; i < endStatesLength; i++) {
        endStates[i] = bb.get() == 1;
    }

    /* Read lookaheadStates[numRows]; a byte value of 1 means true. */
    lookaheadStates = new boolean[lookaheadStatesLength];
    for(int i = 0; i < lookaheadStatesLength; i++) {
        lookaheadStates[i] = bb.get() == 1;
    }

    /* Read a category table and indices for BMP characters. */
    short[] bmpIndices = new short[BMP_INDICES_LENGTH];
    for(int i = 0; i < BMP_INDICES_LENGTH; i++) {
        bmpIndices[i] = bb.getShort();
    }
    byte[] bmpData = new byte[bmpDataLength];
    bb.get(bmpData);
    charCategoryTable = new CompactByteArray(bmpIndices, bmpData);

    /* Read a category table for non-BMP characters. */
    int[] nonBmpData = new int[nonBmpDataLength];
    for(int i = 0; i < nonBmpDataLength; i++) {
        nonBmpData[i] = bb.getInt();
    }
    supplementaryCharCategoryTable = new SupplementaryCharacterData(nonBmpData);

    /* Read additional data; the field is left untouched when none is declared. */
    if(additionalDataLength > 0) {
        additionalData = new byte[additionalDataLength];
        bb.get(additionalData);
    }

    assert bb.position() == bb.limit();

    /* Set numCategories; endStates.length > 0 is guaranteed by the check above. */
    numCategories = stateTable.length / endStates.length;
}
// Returns the additional data held by this iterator.
// The internal array itself is returned, not a copy.
byte[] getAdditionalData() {
    return this.additionalData;
}
// Replaces the additional data held by this iterator.
// The given array is stored as-is, without copying.
void setAdditionalData(byte[] b) {
    this.additionalData = b;
}
/**
* Clones this iterator.
*
* @return A newly-constructed RuleBasedBreakIterator with the same
* behavior as this one.
*/
@Override
public Object clone() {