From 1d9aacdbc13973393e845283c811005af3beeec6 Mon Sep 17 00:00:00 2001
From: Athrun Saga
Date: Sat, 10 Aug 2013 16:07:38 +0800
Subject: [PATCH 1/4] PORT: Analyzers for CN
---
src/contrib/Analyzers/Cn/ChineseAnalyzer.cs | 55 +++-------------
src/contrib/Analyzers/Cn/ChineseFilter.cs | 39 ++++++------
src/contrib/Analyzers/Cn/ChineseTokenizer.cs | 62 +++++++------------
.../Analyzers/Contrib.Analyzers.csproj | 4 +-
4 files changed, 56 insertions(+), 104 deletions(-)
diff --git a/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs b/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs
index 1ec050a455..f91ca1e25b 100644
--- a/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs
+++ b/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs
@@ -21,10 +21,6 @@
using System;
using System.IO;
-using System.Text;
-using System.Collections;
-
-using Lucene.Net.Analysis;
namespace Lucene.Net.Analysis.Cn
{
@@ -32,54 +28,21 @@ namespace Lucene.Net.Analysis.Cn
/// <summary>
/// An <see cref="Analyzer"/> that tokenizes text with <see cref="ChineseTokenizer"/> and
/// filters with <see cref="ChineseFilter"/>
/// </summary>
+ [Obsolete("(3.1) Use {Lucene.Net.Analysis.Standard.StandardAnalyzer} instead, which has the same functionality. This analyzer will be removed in Lucene 5.0")]
public class ChineseAnalyzer : Analyzer
{
-
- public ChineseAnalyzer()
- {
- }
-
- /// <summary>
- /// Creates a TokenStream which tokenizes all the text in the provided Reader.
- /// </summary>
- /// <returns>A TokenStream build from a ChineseTokenizer filtered with ChineseFilter.</returns>
- public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
- {
- TokenStream result = new ChineseTokenizer(reader);
- result = new ChineseFilter(result);
- return result;
- }
-
- private class SavedStreams
- {
- protected internal Tokenizer source;
- protected internal TokenStream result;
- };
-
/// <summary>
- /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all the text in the
- /// provided <see cref="TextReader"/>.
+ /// Creates <see cref="TokenStreamComponents"/>
+ /// used to tokenize all the text in the provided <see cref="TextReader"/>.
/// </summary>
/// <returns>
- /// A <see cref="TokenStream"/> built from a <see cref="ChineseTokenizer"/>
- /// filtered with <see cref="ChineseFilter"/>.
- /// </returns>
- public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+ /// <see cref="TokenStreamComponents"/>
+ /// built from a <see cref="ChineseTokenizer"/> filtered with
+ /// <see cref="ChineseFilter"/>
+ /// </returns>
+ public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
- /* tokenStream() is final, no back compat issue */
- SavedStreams streams = (SavedStreams) PreviousTokenStream;
- if (streams == null)
- {
- streams = new SavedStreams();
- streams.source = new ChineseTokenizer(reader);
- streams.result = new ChineseFilter(streams.source);
- PreviousTokenStream = streams;
- }
- else
- {
- streams.source.Reset(reader);
- }
- return streams.result;
+ Tokenizer source = new ChineseTokenizer(reader);
+ return new TokenStreamComponents(source, new ChineseFilter(source));
}
}
}
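Not part of the patch, but useful for review: a minimal sketch of consuming the ported analyzer, assuming the Lucene.Net 4.x token-stream protocol (Reset / IncrementToken / End / Dispose) and that the Analyzer base class still exposes TokenStream(fieldName, reader) on top of CreateComponents. The class name and sample text are illustrative only.

    // Review sketch only, not part of the patch. Assumes the Lucene.Net 4.x
    // attribute API; ChineseAnalyzerSketch and the sample text are made up.
    using System;
    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Cn;
    using Lucene.Net.Analysis.Tokenattributes;

    public static class ChineseAnalyzerSketch
    {
        public static void Main()
        {
            Analyzer analyzer = new ChineseAnalyzer();
            // The base class builds (and may reuse) the TokenStreamComponents
            // returned by CreateComponents above.
            TokenStream stream = analyzer.TokenStream("f", new StringReader("我爱中国 abc"));
            ICharTermAttribute term = stream.GetAttribute<ICharTermAttribute>();
            stream.Reset();
            while (stream.IncrementToken())
            {
                // One token per CJK character; "abc" passes ChineseFilter because
                // it is longer than one char and not in the stop table.
                Console.WriteLine(term.ToString());
            }
            stream.End();
            stream.Dispose();
        }
    }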
diff --git a/src/contrib/Analyzers/Cn/ChineseFilter.cs b/src/contrib/Analyzers/Cn/ChineseFilter.cs
index e5c83a5958..9d7e5cdbe7 100644
--- a/src/contrib/Analyzers/Cn/ChineseFilter.cs
+++ b/src/contrib/Analyzers/Cn/ChineseFilter.cs
@@ -24,31 +24,34 @@
using System.IO;
using System.Collections;
using System.Globalization;
+using System.Linq;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analysis.Util;
+using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Analysis.Cn
{
- // TODO: convert this XML code to valid .NET
/// <summary>
- /// A {@link TokenFilter} with a stop word table.
- /// <ul>
- /// <li>Numeric tokens are removed.</li>
- /// <li>English tokens must be larger than 1 char.</li>
- /// <li>One Chinese char as one Chinese word.</li>
- /// </ul>
+ /// A <see cref="TokenFilter"/> with a stop word table.
+ /// <list type="bullet">
+ /// <item><description>Numeric tokens are removed.</description></item>
+ /// <item><description>English tokens must be larger than 1 char.</description></item>
+ /// <item><description>One Chinese char as one Chinese word.</description></item>
+ /// </list>
/// TO DO:
- /// <ul>
- /// <li>Add Chinese stop words, such as \ue400</li>
- /// <li>Dictionary based Chinese word extraction</li>
- /// <li>Intelligent Chinese word extraction</li>
- /// </ul>
+ /// <list type="bullet">
+ /// <item><description>Add Chinese stop words, such as \ue400</description></item>
+ /// <item><description>Dictionary based Chinese word extraction</description></item>
+ /// <item><description>Intelligent Chinese word extraction</description></item>
+ /// </list>
/// </summary>
+ [Obsolete("(3.1) Use {Lucene.Net.Analysis.Core.StopFilter} instead, which has the same functionality. This filter will be removed in Lucene 5.0")]
public sealed class ChineseFilter : TokenFilter
{
// Only English now, Chinese to be added later.
- public static String[] STOP_WORDS =
+ public static readonly String[] STOP_WORDS =
{
"and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
@@ -58,21 +61,21 @@ public sealed class ChineseFilter : TokenFilter
};
private CharArraySet stopTable;
- private ITermAttribute termAtt;
+ private ICharTermAttribute termAtt;
public ChineseFilter(TokenStream _in)
: base(_in)
{
- stopTable = new CharArraySet((IEnumerable)STOP_WORDS, false);
- termAtt = AddAttribute<ITermAttribute>();
+ stopTable = new CharArraySet(Version.LUCENE_CURRENT, STOP_WORDS.ToList(), false);
+ termAtt = AddAttribute<ICharTermAttribute>();
}
diff --git a/src/contrib/Analyzers/Cn/ChineseTokenizer.cs b/src/contrib/Analyzers/Cn/ChineseTokenizer.cs
--- a/src/contrib/Analyzers/Cn/ChineseTokenizer.cs
+++ b/src/contrib/Analyzers/Cn/ChineseTokenizer.cs
- /// <p>
+ /// </para>
+ /// <para>
/// For example, if the Chinese text
/// "C1C2C3C4" is to be indexed:
- /// <ul>
- /// <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4</li>
- /// <li>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.</li>
- /// </ul>
- /// </p>
- /// <p>
+ /// <list type="bullet">
+ /// <item><description>The tokens returned from ChineseTokenizer are C1, C2, C3, C4</description></item>
+ /// <item><description>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.</description></item>
+ /// </list>
+ /// </para>
+ /// <para>
/// Therefore the index created by CJKTokenizer is much larger.
- /// </p>
- /// <p>
+ /// </para>
+ /// <para>
/// The problem is that when searching for C1, C1C2, C1C3,
/// C4C2, C1C2C3 ... the ChineseTokenizer works, but the
/// CJKTokenizer will not work.
- /// </p>
- ///
+ /// </para>
+ /// </summary>
+ [Obsolete("(3.1) Use {Lucene.Net.Analysis.Standard.StandardTokenizer} instead, which has the same functionality. This filter will be removed in Lucene 5.0")]
public sealed class ChineseTokenizer : Tokenizer
{
public ChineseTokenizer(TextReader _in)
: base(_in)
{
- Init();
- }
-
- public ChineseTokenizer(AttributeSource source, TextReader _in)
- : base(source, _in)
- {
- Init();
+ termAtt = AddAttribute<ICharTermAttribute>();
+ offsetAtt = AddAttribute<IOffsetAttribute>();
}
public ChineseTokenizer(AttributeFactory factory, TextReader _in)
: base(factory, _in)
{
- Init();
- }
-
- private void Init()
- {
- termAtt = AddAttribute<ITermAttribute>();
+ termAtt = AddAttribute<ICharTermAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
}
@@ -90,8 +83,8 @@ private void Init()
private int length;
private int start;
- private ITermAttribute termAtt;
- private IOffsetAttribute offsetAtt;
+ private readonly ICharTermAttribute termAtt;
+ private readonly IOffsetAttribute offsetAtt;
private void Push(char c)
{
@@ -101,10 +94,9 @@ private void Push(char c)
private bool Flush()
{
-
if (length > 0)
{
- termAtt.SetTermBuffer(buffer, 0, length);
+ termAtt.CopyBuffer(buffer, 0, length);
offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
return true;
}
@@ -112,7 +104,6 @@ private bool Flush()
return false;
}
-
public override bool IncrementToken()
{
ClearAttributes();
@@ -123,7 +114,6 @@ public override bool IncrementToken()
while (true)
{
-
char c;
offset++;
@@ -133,7 +123,7 @@ public override bool IncrementToken()
bufferIndex = 0;
}
- if (dataLen == 0)
+ if (dataLen == -1)
{
offset--;
return Flush();
@@ -181,11 +171,5 @@ public override void Reset()
base.Reset();
offset = bufferIndex = dataLen = 0;
}
-
- public override void Reset(TextReader input)
- {
- base.Reset(input);
- Reset();
- }
}
}
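To illustrate the per-character behavior described in the doc comment above (C1C2C3C4 becomes C1, C2, C3, C4), a hedged sketch driving the tokenizer directly; the attribute names and StartOffset/EndOffset properties follow the Lucene.Net 4.x conventions this patch assumes, and the sample text is made up.

    // Review sketch only: driving ChineseTokenizer directly to confirm one
    // token per Chinese character, with corrected offsets. Assumes Lucene.Net
    // 4.x attribute names (ICharTermAttribute, IOffsetAttribute).
    using System;
    using System.IO;
    using Lucene.Net.Analysis.Cn;
    using Lucene.Net.Analysis.Tokenattributes;

    public static class ChineseTokenizerSketch
    {
        public static void Main()
        {
            var tokenizer = new ChineseTokenizer(new StringReader("中文ab"));
            var term = tokenizer.GetAttribute<ICharTermAttribute>();
            var offset = tokenizer.GetAttribute<IOffsetAttribute>();
            tokenizer.Reset();
            while (tokenizer.IncrementToken())
            {
                Console.WriteLine(term.ToString() + " [" + offset.StartOffset
                    + "-" + offset.EndOffset + "]");
            }
            // Expected: 中 [0-1], 文 [1-2], ab [2-4] — CJK chars emitted one by
            // one, consecutive basic-Latin letters grouped into a single token.
            tokenizer.End();
            tokenizer.Dispose();
        }
    }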
diff --git a/src/contrib/Analyzers/Contrib.Analyzers.csproj b/src/contrib/Analyzers/Contrib.Analyzers.csproj
index e13f118363..5ba1857ce8 100644
--- a/src/contrib/Analyzers/Contrib.Analyzers.csproj
+++ b/src/contrib/Analyzers/Contrib.Analyzers.csproj
@@ -103,6 +103,9 @@
+ <Compile Include="Cn\ChineseAnalyzer.cs" />
+ <Compile Include="Cn\ChineseFilter.cs" />
+ <Compile Include="Cn\ChineseTokenizer.cs" />
@@ -170,7 +173,6 @@
-
From 9b08c3fe183d6c7e9062c1e659ed1efb051737a8 Mon Sep 17 00:00:00 2001
From: Athrun Saga
Date: Sat, 10 Aug 2013 21:17:02 +0800
Subject: [PATCH 2/4] Port: Analyzers for CN
---
.../Analyzers/Cn/ChineseFilterFactory.cs | 53 ++++++++++++++++++
.../Analyzers/Cn/ChineseTokenizerFactory.cs | 55 +++++++++++++++++++
2 files changed, 108 insertions(+)
create mode 100644 src/contrib/Analyzers/Cn/ChineseFilterFactory.cs
create mode 100644 src/contrib/Analyzers/Cn/ChineseTokenizerFactory.cs
diff --git a/src/contrib/Analyzers/Cn/ChineseFilterFactory.cs b/src/contrib/Analyzers/Cn/ChineseFilterFactory.cs
new file mode 100644
index 0000000000..ea01e9b17f
--- /dev/null
+++ b/src/contrib/Analyzers/Cn/ChineseFilterFactory.cs
@@ -0,0 +1,53 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+*/
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis.Util;
+
+namespace Lucene.Net.Analysis.Cn
+{
+ /// <summary>
+ /// Factory for <see cref="ChineseFilter"/>
+ /// </summary>
+ [Obsolete("Use {Lucene.Net.Analysis.Core.StopFilterFactory} instead.")]
+ public class ChineseFilterFactory : TokenFilterFactory
+ {
+ /// <summary>
+ /// Creates a new ChineseFilterFactory
+ /// </summary>
+ public ChineseFilterFactory(IDictionary<string, string> args)
+ : base(args)
+ {
+ if (args.Count > 0)
+ {
+ throw new ArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ public override TokenStream Create(TokenStream _in)
+ {
+ return new ChineseFilter(_in);
+ }
+ }
+}
diff --git a/src/contrib/Analyzers/Cn/ChineseTokenizerFactory.cs b/src/contrib/Analyzers/Cn/ChineseTokenizerFactory.cs
new file mode 100644
index 0000000000..91c33a481c
--- /dev/null
+++ b/src/contrib/Analyzers/Cn/ChineseTokenizerFactory.cs
@@ -0,0 +1,55 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+*/
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Cn
+{
+ /// <summary>
+ /// Factory for <see cref="ChineseTokenizer"/>
+ /// </summary>
+ [Obsolete("Use {Lucene.Net.Analysis.Standard.StandardTokenizerFactory} instead.")]
+ public class ChineseTokenizerFactory : TokenizerFactory
+ {
+ /// <summary>
+ /// Creates a new ChineseTokenizerFactory
+ /// </summary>
+ public ChineseTokenizerFactory(IDictionary<string, string> args)
+ : base(args)
+ {
+ if (args.Count > 0)
+ {
+ throw new ArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader _in)
+ {
+ return new ChineseTokenizer(factory, _in);
+ }
+ }
+}
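For review, a hedged sketch of how the two factories from this patch would be wired together by a Solr-style schema loader. The empty dictionaries satisfy the "no unknown parameters" check in both constructors; DEFAULT_ATTRIBUTE_FACTORY is assumed to exist on AttributeSource.AttributeFactory, matching the Java signature the Create override mirrors.

    // Review sketch only: constructing the factories as a schema loader
    // would. Any unrecognized key makes the constructors throw.
    using System;
    using System.Collections.Generic;
    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Cn;
    using Lucene.Net.Util;

    public static class CnFactoriesSketch
    {
        public static void Main()
        {
            var tokenizerFactory = new ChineseTokenizerFactory(new Dictionary<string, string>());
            var filterFactory = new ChineseFilterFactory(new Dictionary<string, string>());

            Tokenizer source = tokenizerFactory.Create(
                AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                new StringReader("我爱中国"));
            TokenStream chain = filterFactory.Create(source);
            // "chain" now matches what ChineseAnalyzer.CreateComponents builds.
        }
    }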
From cb2c8c767524db91decde5543b967350ee679c1e Mon Sep 17 00:00:00 2001
From: Athrun Saga
Date: Sat, 10 Aug 2013 22:07:09 +0800
Subject: [PATCH 3/4] Port: Analyzer for CN
---
src/contrib/Analyzers/Contrib.Analyzers.csproj | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/contrib/Analyzers/Contrib.Analyzers.csproj b/src/contrib/Analyzers/Contrib.Analyzers.csproj
index 5ba1857ce8..d5e68bba59 100644
--- a/src/contrib/Analyzers/Contrib.Analyzers.csproj
+++ b/src/contrib/Analyzers/Contrib.Analyzers.csproj
@@ -105,7 +105,9 @@
+ <Compile Include="Cn\ChineseFilterFactory.cs" />
+ <Compile Include="Cn\ChineseTokenizerFactory.cs" />
From c2babbf63da30a26f94363c055ccc802012be515 Mon Sep 17 00:00:00 2001
From: Athrun Saga
Date: Sun, 11 Aug 2013 22:28:16 +0800
Subject: [PATCH 4/4] Fix: IncrementToken hangs at end of input.
---
src/contrib/Analyzers/Cn/ChineseTokenizer.cs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/contrib/Analyzers/Cn/ChineseTokenizer.cs b/src/contrib/Analyzers/Cn/ChineseTokenizer.cs
index 756e692c20..9173327910 100644
--- a/src/contrib/Analyzers/Cn/ChineseTokenizer.cs
+++ b/src/contrib/Analyzers/Cn/ChineseTokenizer.cs
@@ -123,7 +123,7 @@ public override bool IncrementToken()
bufferIndex = 0;
}
- if (dataLen == -1)
+ if (dataLen == 0)
{
offset--;
return Flush();
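Background for this one-line revert: patch 1 changed the end-of-stream test to match the Java source, where Reader.read() returns -1 at EOF. In .NET, TextReader.Read(char[], int, int) returns 0 at end of input instead, so the -1 test never fired and IncrementToken spun forever once the reader was drained. A standalone demonstration of the convention:

    // Standalone demo (not part of the patch) of the EOF convention behind
    // this fix: .NET's TextReader.Read(char[], int, int) returns 0 at end of
    // input, never -1, so "dataLen == -1" can never terminate the read loop.
    using System;
    using System.IO;

    public static class EofConventionDemo
    {
        public static void Main()
        {
            var reader = new StringReader("ab");
            var ioBuffer = new char[4];
            int dataLen;
            while ((dataLen = reader.Read(ioBuffer, 0, ioBuffer.Length)) != 0)
            {
                Console.WriteLine("read " + dataLen + " chars");
            }
            Console.WriteLine("EOF: Read returned 0, loop exits");
        }
    }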