From 1d9aacdbc13973393e845283c811005af3beeec6 Mon Sep 17 00:00:00 2001
From: Athrun Saga
Date: Sat, 10 Aug 2013 16:07:38 +0800
Subject: [PATCH 1/4] PORT: Analyzers for CN
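
The Lucene 4.x Analyzer contract replaces TokenStream()/ReusableTokenStream()
with a single CreateComponents() returning TokenStreamComponents; stream reuse
is handled by the base class, so the SavedStreams machinery goes away. The
attribute API likewise moves from ITermAttribute to ICharTermAttribute.

A rough sketch of how the ported chain is expected to behave (not part of the
patch; it assumes the base Analyzer still exposes a TokenStream(fieldName,
reader) entry point as in Lucene 4.x Java, and the usual 4.x consumer loop):

    Analyzer analyzer = new ChineseAnalyzer();
    TokenStream ts = analyzer.TokenStream("f", new StringReader("中文 and 123 test"));
    ICharTermAttribute term = ts.AddAttribute<ICharTermAttribute>();
    ts.Reset();
    while (ts.IncrementToken())
    {
        // Print each term using the buffer/length pair, as the filter does.
        Console.WriteLine(new string(term.Buffer, 0, term.Length));
    }
    ts.End();

    // Expected terms: 中, 文, test
    // Each CJK char becomes its own token, "and" is in the stop table, and
    // numeric tokens such as "123" are dropped by ChineseFilter.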
---
 src/contrib/Analyzers/Cn/ChineseAnalyzer.cs  | 55 +++-------------
 src/contrib/Analyzers/Cn/ChineseFilter.cs    | 39 ++++++------
 src/contrib/Analyzers/Cn/ChineseTokenizer.cs | 62 +++++++------------
 .../Analyzers/Contrib.Analyzers.csproj       |  4 +-
 4 files changed, 56 insertions(+), 104 deletions(-)

diff --git a/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs b/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs
index 1ec050a455..f91ca1e25b 100644
--- a/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs
+++ b/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs
@@ -21,10 +21,6 @@
 
 using System;
 using System.IO;
-using System.Text;
-using System.Collections;
-
-using Lucene.Net.Analysis;
 
 namespace Lucene.Net.Analysis.Cn
 {
@@ -32,54 +28,21 @@ namespace Lucene.Net.Analysis.Cn
     /// <summary>
     /// An <see cref="Analyzer"/> that tokenizes text with <see cref="ChineseTokenizer"/> and
     /// filters with <see cref="ChineseFilter"/>
     /// </summary>
+    [Obsolete("(3.1) Use {Lucene.Net.Analysis.Standard.StandardAnalyzer} instead, which has the same functionality. This analyzer will be removed in Lucene 5.0")]
     public class ChineseAnalyzer : Analyzer
     {
-
-        public ChineseAnalyzer()
-        {
-        }
-
-        /// <summary>
-        /// Creates a TokenStream which tokenizes all the text in the provided Reader.
-        /// </summary>
-        /// <returns>A TokenStream build from a ChineseTokenizer filtered with ChineseFilter.</returns>
-        public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
-        {
-            TokenStream result = new ChineseTokenizer(reader);
-            result = new ChineseFilter(result);
-            return result;
-        }
-
-        private class SavedStreams
-        {
-            protected internal Tokenizer source;
-            protected internal TokenStream result;
-        };
-
         /// <summary>
-        /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all the text in the
-        /// provided <see cref="TextReader"/>.
+        /// Creates <see cref="TokenStreamComponents"/>
+        /// used to tokenize all the text in the provided <see cref="TextReader"/>.
         /// </summary>
         /// <returns>
-        /// A <see cref="TokenStream"/> built from a <see cref="ChineseTokenizer"/>
-        /// filtered with <see cref="ChineseFilter"/>.
-        /// </returns>
-        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+        /// <see cref="TokenStreamComponents"/>
+        /// built from a <see cref="ChineseTokenizer"/> filtered with <see cref="ChineseFilter"/>
+        /// </returns>
+        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
         {
-            /* tokenStream() is final, no back compat issue */
-            SavedStreams streams = (SavedStreams) PreviousTokenStream;
-            if (streams == null)
-            {
-                streams = new SavedStreams();
-                streams.source = new ChineseTokenizer(reader);
-                streams.result = new ChineseFilter(streams.source);
-                PreviousTokenStream = streams;
-            }
-            else
-            {
-                streams.source.Reset(reader);
-            }
-            return streams.result;
+            Tokenizer source = new ChineseTokenizer(reader);
+            return new TokenStreamComponents(source, new ChineseFilter(source));
         }
     }
 }
diff --git a/src/contrib/Analyzers/Cn/ChineseFilter.cs b/src/contrib/Analyzers/Cn/ChineseFilter.cs
index e5c83a5958..9d7e5cdbe7 100644
--- a/src/contrib/Analyzers/Cn/ChineseFilter.cs
+++ b/src/contrib/Analyzers/Cn/ChineseFilter.cs
@@ -24,31 +24,34 @@
 using System.IO;
 using System.Collections;
 using System.Globalization;
+using System.Linq;
 
 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analysis.Util;
+using Version = Lucene.Net.Util.Version;
 
 namespace Lucene.Net.Analysis.Cn
 {
-    // TODO: convert this XML code to valid .NET
     /// <summary>
-    /// A {@link TokenFilter} with a stop word table.
-    /// <ul>
-    /// <li>Numeric tokens are removed.
-    /// <li>English tokens must be larger than 1 char.
-    /// <li>One Chinese char as one Chinese word.
-    /// </ul>
+    /// A <see cref="TokenFilter"/> with a stop word table.
+    /// <list type="bullet">
+    /// <item>Numeric tokens are removed.</item>
+    /// <item>English tokens must be larger than 1 char.</item>
+    /// <item>One Chinese char as one Chinese word.</item>
+    /// </list>
     /// TO DO:
-    /// <ol>
-    /// <li>Add Chinese stop words, such as \ue400
-    /// <li>Dictionary based Chinese word extraction
-    /// <li>Intelligent Chinese word extraction
-    /// </ol>
+    /// <list type="number">
+    /// <item>Add Chinese stop words, such as \ue400</item>
+    /// <item>Dictionary based Chinese word extraction</item>
+    /// <item>Intelligent Chinese word extraction</item>
+    /// </list>
     /// </summary>
+ [Obsolete("(3.1) Use {Lucene.Net.Analysis.Core.StopFilter} instead, which has the same functionality. This filter will be removed in Lucene 5.0")] public sealed class ChineseFilter : TokenFilter { // Only English now, Chinese to be added later. - public static String[] STOP_WORDS = + public static readonly String[] STOP_WORDS = { "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", @@ -58,21 +61,21 @@ public sealed class ChineseFilter : TokenFilter }; private CharArraySet stopTable; - private ITermAttribute termAtt; + private ICharTermAttribute termAtt; public ChineseFilter(TokenStream _in) : base(_in) { - stopTable = new CharArraySet((IEnumerable)STOP_WORDS, false); - termAtt = AddAttribute(); + stopTable = new CharArraySet(Version.LUCENE_CURRENT, STOP_WORDS.ToList(), false); + termAtt = AddAttribute(); } public override bool IncrementToken() { while (input.IncrementToken()) { - char[] text = termAtt.TermBuffer(); - int termLength = termAtt.TermLength(); + char[] text = termAtt.Buffer; + int termLength = termAtt.Length; // why not key off token type here assuming ChineseTokenizer comes first? if (!stopTable.Contains(text, 0, termLength)) diff --git a/src/contrib/Analyzers/Cn/ChineseTokenizer.cs b/src/contrib/Analyzers/Cn/ChineseTokenizer.cs index 69947aa525..756e692c20 100644 --- a/src/contrib/Analyzers/Cn/ChineseTokenizer.cs +++ b/src/contrib/Analyzers/Cn/ChineseTokenizer.cs @@ -32,52 +32,45 @@ namespace Lucene.Net.Analysis.Cn { /// + /// /// Tokenize Chinese text as individual chinese chars. - ///

+ /// + /// /// The difference between ChineseTokenizer and /// CJKTokenizer is that they have different /// token parsing logic. - ///

- ///

+ /// + /// /// For example, if the Chinese text /// "C1C2C3C4" is to be indexed: - ///

    - ///
  • The tokens returned from ChineseTokenizer are C1, C2, C3, C4
  • - ///
  • The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.
  • - ///
- ///

- ///

+ /// + /// The tokens returned from ChineseTokenizer are C1, C2, C3, C4 + /// The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4. + /// + /// + /// /// Therefore the index created by CJKTokenizer is much larger. - ///

- ///

+ /// + /// /// The problem is that when searching for C1, C1C2, C1C3, /// C4C2, C1C2C3 ... the ChineseTokenizer works, but the /// CJKTokenizer will not work. - ///

- ///
+ /// + /// + [Obsolete("(3.1) Use {Lucene.Net.Analysis.Standard.StandardTokenizer} instead, which has the same functionality. This filter will be removed in Lucene 5.0")] public sealed class ChineseTokenizer : Tokenizer { public ChineseTokenizer(TextReader _in) : base(_in) { - Init(); - } - - public ChineseTokenizer(AttributeSource source, TextReader _in) - : base(source, _in) - { - Init(); + termAtt = AddAttribute(); + offsetAtt = AddAttribute(); } public ChineseTokenizer(AttributeFactory factory, TextReader _in) : base(factory, _in) { - Init(); - } - - private void Init() - { - termAtt = AddAttribute(); + termAtt = AddAttribute(); offsetAtt = AddAttribute(); } @@ -90,8 +83,8 @@ private void Init() private int length; private int start; - private ITermAttribute termAtt; - private IOffsetAttribute offsetAtt; + private readonly ICharTermAttribute termAtt; + private readonly IOffsetAttribute offsetAtt; private void Push(char c) { @@ -101,10 +94,9 @@ private void Push(char c) private bool Flush() { - if (length > 0) { - termAtt.SetTermBuffer(buffer, 0, length); + termAtt.CopyBuffer(buffer, 0, length); offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length)); return true; } @@ -112,7 +104,6 @@ private bool Flush() return false; } - public override bool IncrementToken() { ClearAttributes(); @@ -123,7 +114,6 @@ public override bool IncrementToken() while (true) { - char c; offset++; @@ -133,7 +123,7 @@ public override bool IncrementToken() bufferIndex = 0; } - if (dataLen == 0) + if (dataLen == -1) { offset--; return Flush(); @@ -181,11 +171,5 @@ public override void Reset() base.Reset(); offset = bufferIndex = dataLen = 0; } - - public override void Reset(TextReader input) - { - base.Reset(input); - Reset(); - } } } diff --git a/src/contrib/Analyzers/Contrib.Analyzers.csproj b/src/contrib/Analyzers/Contrib.Analyzers.csproj index e13f118363..5ba1857ce8 100644 --- a/src/contrib/Analyzers/Contrib.Analyzers.csproj +++ b/src/contrib/Analyzers/Contrib.Analyzers.csproj @@ -103,6 +103,9 @@ + + + @@ -170,7 +173,6 @@ - From 9b08c3fe183d6c7e9062c1e659ed1efb051737a8 Mon Sep 17 00:00:00 2001 From: Athrun Saga Date: Sat, 10 Aug 2013 21:17:02 +0800 Subject: [PATCH 2/4] Port: Analyzers for CN --- .../Analyzers/Cn/ChineseFilterFactory.cs | 53 ++++++++++++++++++ .../Analyzers/Cn/ChineseTokenizerFactory.cs | 55 +++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 src/contrib/Analyzers/Cn/ChineseFilterFactory.cs create mode 100644 src/contrib/Analyzers/Cn/ChineseTokenizerFactory.cs diff --git a/src/contrib/Analyzers/Cn/ChineseFilterFactory.cs b/src/contrib/Analyzers/Cn/ChineseFilterFactory.cs new file mode 100644 index 0000000000..ea01e9b17f --- /dev/null +++ b/src/contrib/Analyzers/Cn/ChineseFilterFactory.cs @@ -0,0 +1,53 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
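
These factories mirror the Lucene/Solr TokenizerFactory/TokenFilterFactory
pattern: components are built from a name/value argument map (typically a
schema file), and each constructor rejects parameters it does not recognize.
A minimal sketch of the intended wiring (hypothetical caller code, not part
of the patch; the DEFAULT_ATTRIBUTE_FACTORY name is an assumption based on
the Lucene.Net AttributeSource API of the time):

    var tokenizerFactory = new ChineseTokenizerFactory(new Dictionary<string, string>());
    var filterFactory = new ChineseFilterFactory(new Dictionary<string, string>());

    Tokenizer tokenizer = tokenizerFactory.Create(
        AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
        new StringReader("C1C2C3C4"));
    TokenStream stream = filterFactory.Create(tokenizer);

    // Any unconsumed parameter makes the constructor throw:
    // new ChineseFilterFactory(new Dictionary<string, string> { { "bogus", "x" } });
    // -> ArgumentException: Unknown parameters: ...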
---
 .../Analyzers/Cn/ChineseFilterFactory.cs    | 53 ++++++++++++++++++
 .../Analyzers/Cn/ChineseTokenizerFactory.cs | 55 +++++++++++++++++++
 2 files changed, 108 insertions(+)
 create mode 100644 src/contrib/Analyzers/Cn/ChineseFilterFactory.cs
 create mode 100644 src/contrib/Analyzers/Cn/ChineseTokenizerFactory.cs

diff --git a/src/contrib/Analyzers/Cn/ChineseFilterFactory.cs b/src/contrib/Analyzers/Cn/ChineseFilterFactory.cs
new file mode 100644
index 0000000000..ea01e9b17f
--- /dev/null
+++ b/src/contrib/Analyzers/Cn/ChineseFilterFactory.cs
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis.Util;
+
+namespace Lucene.Net.Analysis.Cn
+{
+    /// <summary>
+    /// Factory for <see cref="ChineseFilter"/>
+    /// </summary>
+    [Obsolete("Use {Lucene.Net.Analysis.Core.StopFilterFactory} instead.")]
+    public class ChineseFilterFactory : TokenFilterFactory
+    {
+        /// <summary>
+        /// Creates a new ChineseFilterFactory
+        /// </summary>
+        public ChineseFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            if (args.Count > 0)
+            {
+                throw new ArgumentException("Unknown parameters: " + args);
+            }
+        }
+
+        public override TokenStream Create(TokenStream _in)
+        {
+            return new ChineseFilter(_in);
+        }
+    }
+}
diff --git a/src/contrib/Analyzers/Cn/ChineseTokenizerFactory.cs b/src/contrib/Analyzers/Cn/ChineseTokenizerFactory.cs
new file mode 100644
index 0000000000..91c33a481c
--- /dev/null
+++ b/src/contrib/Analyzers/Cn/ChineseTokenizerFactory.cs
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Cn
+{
+    /// <summary>
+    /// Factory for <see cref="ChineseTokenizer"/>
+    /// </summary>
+    [Obsolete("Use {Lucene.Net.Analysis.Standard.StandardTokenizerFactory} instead.")]
+    public class ChineseTokenizerFactory : TokenizerFactory
+    {
+        /// <summary>
+        /// Creates a new ChineseTokenizerFactory
+        /// </summary>
+        public ChineseTokenizerFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            if (args.Count > 0)
+            {
+                throw new ArgumentException("Unknown parameters: " + args);
+            }
+        }
+
+        public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader _in)
+        {
+            return new ChineseTokenizer(factory, _in);
+        }
+    }
+}

From cb2c8c767524db91decde5543b967350ee679c1e Mon Sep 17 00:00:00 2001
From: Athrun Saga
Date: Sat, 10 Aug 2013 22:07:09 +0800
Subject: [PATCH 3/4] Port: Analyzer for CN

---
 src/contrib/Analyzers/Contrib.Analyzers.csproj | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/contrib/Analyzers/Contrib.Analyzers.csproj b/src/contrib/Analyzers/Contrib.Analyzers.csproj
index 5ba1857ce8..d5e68bba59 100644
--- a/src/contrib/Analyzers/Contrib.Analyzers.csproj
+++ b/src/contrib/Analyzers/Contrib.Analyzers.csproj
@@ -105,7 +105,9 @@
+    <Compile Include="Cn\ChineseFilterFactory.cs" />
+    <Compile Include="Cn\ChineseTokenizerFactory.cs" />

From c2babbf63da30a26f94363c055ccc802012be515 Mon Sep 17 00:00:00 2001
From: Athrun Saga
Date: Sun, 11 Aug 2013 22:28:16 +0800
Subject: [PATCH 4/4] Fix: IncrementToken hangs at end of input
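
The end-of-stream check was ported from Java as "dataLen == -1", but the two
runtimes signal EOF differently: Java's Reader.read(char[], int, int) returns
-1 at end of stream, while .NET's TextReader.Read(char[], int, int) returns 0.
The -1 branch therefore never fires, so IncrementToken keeps re-reading an
empty buffer instead of flushing the final token and returning. Comparing
against 0 restores termination. Illustrative snippet (not part of the patch):

    TextReader reader = new StringReader("");
    char[] ioBuffer = new char[16];
    int dataLen = reader.Read(ioBuffer, 0, ioBuffer.Length);
    // dataLen == 0 in .NET; a Java Reader would have returned -1 here.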
---
 src/contrib/Analyzers/Cn/ChineseTokenizer.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/contrib/Analyzers/Cn/ChineseTokenizer.cs b/src/contrib/Analyzers/Cn/ChineseTokenizer.cs
index 756e692c20..9173327910 100644
--- a/src/contrib/Analyzers/Cn/ChineseTokenizer.cs
+++ b/src/contrib/Analyzers/Cn/ChineseTokenizer.cs
@@ -123,7 +123,7 @@ public override bool IncrementToken()
                     bufferIndex = 0;
                 }
 
-                if (dataLen == -1)
+                if (dataLen == 0)
                 {
                     offset--;
                     return Flush();