我搜索 CS1.5,用自己写的分词把 CS1.5 分解成 "CS" 和 "1.5",
在索引中竟然搜不到结果,为什么?为什么?
下面的源码很清楚,搜索不到结果,为什么为什么为什么??怎么解决??
using System;
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Util;
using Lucene.Net.Search;
using Lucene.Net.QueryParsers;
using System.Text.RegularExpressions;
using PanGu;
using PanGu.Dict;

namespace LuceneTest
{
    /// <summary>
    /// Small console demo: builds a Lucene index of game titles, then searches it
    /// with keywords that were pre-segmented by <see cref="getWords"/>.
    /// </summary>
    class Program
    {
        // Runs of digits and dots, e.g. "1.5".
        static readonly Regex RNumber = new Regex(@"[\d\.]+");
        // Runs of ASCII letters, e.g. "CS".
        static readonly Regex RABC = new Regex(@"[a-zA-Z]+");
        // Runs of CJK unified ideographs.
        static readonly Regex RCN = new Regex(@"[\u4e00-\u9fa5]+");

        /// <summary>
        /// Splits <paramref name="keywords"/> into space-separated tokens: Chinese runs
        /// (further segmented by PanGu), then ASCII-letter runs, then digit/dot runs.
        /// Characters outside those three classes are dropped.
        /// </summary>
        /// <param name="keywords">Raw text to segment; may be null or empty.</param>
        /// <returns>
        /// The tokens joined by single spaces, grouped by class (Chinese, letters,
        /// numbers) rather than in their original order; an empty string when nothing
        /// matches or the input is null/empty.
        /// </returns>
        static string getWords(string keywords)
        {
            if (string.IsNullOrEmpty(keywords))
            {
                return string.Empty;
            }

            PanGu.Segment.Init();

            List<string> tokens = new List<string>();

            // Chinese runs are joined with spaces and handed to PanGu for word segmentation.
            List<string> chineseRuns = new List<string>();
            CollectMatches(RCN, keywords, chineseRuns);
            if (chineseRuns.Count > 0)
            {
                Segment segment = new Segment();
                ICollection<WordInfo> words = segment.DoSegment(string.Join(" ", chineseRuns.ToArray()));
                foreach (WordInfo wordInfo in words)
                {
                    if (wordInfo == null || wordInfo.Word == null)
                    {
                        continue;
                    }

                    string word = wordInfo.Word.Trim();
                    if (word.Length > 0)
                    {
                        tokens.Add(word);
                    }
                }
            }

            CollectMatches(RABC, keywords, tokens);
            CollectMatches(RNumber, keywords, tokens);

            // Joining a token list guarantees exactly one space between tokens, with no
            // leading/trailing or doubled separators when a group is empty.
            return string.Join(" ", tokens.ToArray());
        }

        /// <summary>
        /// Appends the value of every match of <paramref name="pattern"/> in
        /// <paramref name="text"/> to <paramref name="tokens"/>, in match order.
        /// </summary>
        static void CollectMatches(Regex pattern, string text, List<string> tokens)
        {
            foreach (Match match in pattern.Matches(text))
            {
                tokens.Add(match.Value);
            }
        }

        /// <summary>
        /// Rebuilds the index in "IndexDir", searches it for "CS1.5" and prints every hit.
        /// </summary>
        static void Main(string[] args)
        {
            Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);

            IndexWriter writer = new IndexWriter("IndexDir", analyzer, true);
            try
            {
                AddDocument(writer, "暗黑破坏神1", "暗黑破坏神1很好玩");
                AddDocument(writer, "暗黑破坏神2", "暗黑破坏神2很好玩");
                AddDocument(writer, "暗黑破坏神3", "暗黑破坏神3很好玩");
                AddDocument(writer, "暗黑破坏神4", "暗黑破坏神4很好玩");
                AddDocument(writer, "反恐精英CS1.5版本", "反恐精英CS1.5很好玩");
                AddDocument(writer, "我的暗黑神啊", "我的暗黑神啊我的暗黑神啊");
                AddDocument(writer, "反恐精英CS1.6版本", "反恐精英CS1.6很好玩");
                writer.Optimize();
            }
            finally
            {
                // Always release the index write lock, even if indexing fails.
                writer.Close();
            }

            IndexSearcher searcher = new IndexSearcher("IndexDir", true);
            try
            {
                MultiFieldQueryParser parser = new MultiFieldQueryParser(new string[] { "title", "content" }, analyzer);
                string words = getWords("CS1.5"); // search for CS1.5 here
                Query query = parser.Parse(words);
                Hits hits = searcher.Search(query);
                for (int i = 0; i < hits.Length(); i++)
                {
                    Document doc = hits.Doc(i);
                    Console.WriteLine(string.Format("title:{0} content:{1}", doc.Get("title"), doc.Get("content")));
                }
            }
            finally
            {
                searcher.Close();
            }

            Console.ReadKey();
        }

        /// <summary>
        /// Adds one document with searchable, stored "title" and "content" fields.
        /// </summary>
        /// <param name="writer">Open index writer that receives the document.</param>
        /// <param name="title">Title text; stored verbatim.</param>
        /// <param name="content">Body text; stored verbatim.</param>
        static void AddDocument(IndexWriter writer, string title, string content)
        {
            Document document = new Document();
            AddSearchableField(document, "title", title);
            AddSearchableField(document, "content", content);
            writer.AddDocument(document);
        }

        /// <summary>
        /// Adds <paramref name="text"/> to <paramref name="document"/> under
        /// <paramref name="name"/> twice: once raw (stored and indexed, as before) and
        /// once pre-segmented with <see cref="getWords"/> (indexed only).
        /// </summary>
        /// <remarks>
        /// Queries in this program are built from getWords output, which turns "CS1.5"
        /// into the separate terms "CS" and "1.5". The analyzer applied to the raw text
        /// keeps "CS1.5" together as one term, so those query terms would never be found
        /// in the index. Indexing the segmented copy as well puts the same terms the
        /// query side produces into the index, while the raw copy preserves existing
        /// matches and the stored value returned by Document.Get.
        /// </remarks>
        static void AddSearchableField(Document document, string name, string text)
        {
            document.Add(new Field(name, text, Field.Store.YES, Field.Index.ANALYZED));
            document.Add(new Field(name, getWords(text), Field.Store.NO, Field.Index.ANALYZED));
        }
    }
}
你的IndexWriter使用的是StandardAnalyzer,这个分词器会过滤一些StopWords(English),作为英文句号'.'当然会被过滤,你可以在STOP_WORDS_SET中看到,当然你清除了这个集合就不会过滤了。
我找不到那个"." 我要把那个 "."去掉