首页 新闻 会员 周边

Lucene 索引过滤掉了 标点么?

0
悬赏园豆:50 [已关闭问题] 关闭于 2013-01-09 15:06

我搜 CS1.5  我自己写了分词,把CS1.5 分解成 “CS”  和 "1.5"

在索引中竟然搜不到结果,为什么 为什吗?

下面的源码很清楚,搜索不到结果,为什么为什么为什么??怎么解决??

 

 

using System;
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Util;
using Lucene.Net.Search;
using Lucene.Net.QueryParsers;
using System.Text.RegularExpressions;
using PanGu;
using PanGu.Dict;



namespace LuceneTest
{
    class Program
    {

        static Regex RNumber = new Regex(@"[\d\.]+");
        static Regex RABC = new Regex(@"[a-zA-Z]+");
        static Regex RCN = new Regex(@"[\u4e00-\u9fa5]+");

        static string getWords(string keywords)
        {
            PanGu.Segment.Init();

            string cnString = string.Empty;
            if (RCN.IsMatch(keywords))
            {
                MatchCollection MC = RCN.Matches(keywords);
                for (int i = 0; i < MC.Count; i++)
                {
                    cnString += " " + MC[i].Value;
                }

                cnString = cnString.Trim();


                Segment segment = new Segment();
                ICollection<WordInfo> words = segment.DoSegment(cnString);
                StringBuilder result = new StringBuilder();
                foreach (WordInfo wordInfo in words)
                {
                    if (wordInfo == null)
                    {
                        continue;
                    }

                    result.Append(wordInfo.Word);
                    result.Append(" ");
                }

                cnString = result.ToString();
            }

            string numString = string.Empty;
            if (RNumber.IsMatch(keywords))
            {
                MatchCollection MC = RNumber.Matches(keywords);
                for (int i = 0; i < MC.Count; i++)
                {
                    numString += " " + MC[i].Value;
                }
                numString = numString.Trim();
            }

            string abcString = string.Empty;
            if (RABC.IsMatch(keywords))
            {
                MatchCollection MC = RABC.Matches(keywords);
                for (int i = 0; i < MC.Count; i++)
                {
                    abcString += " " + MC[i].Value;
                }
                abcString = abcString.Trim();
            }


            return (cnString + " " + abcString + " " + numString).Trim().Replace("  ", " ");













        }

        static void Main(string[] args)
        {
            Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);

            IndexWriter writer = new IndexWriter("IndexDir", analyzer, true);


            AddDocument(writer, "暗黑破坏神1", "暗黑破坏神1很好玩");
            AddDocument(writer, "暗黑破坏神2", "暗黑破坏神2很好玩");
            AddDocument(writer, "暗黑破坏神3", "暗黑破坏神3很好玩");
            AddDocument(writer, "暗黑破坏神4", "暗黑破坏神4很好玩");
            AddDocument(writer, "反恐精英CS1.5版本", "反恐精英CS1.5很好玩");
            AddDocument(writer, "我的暗黑神啊", "我的暗黑神啊我的暗黑神啊");
            AddDocument(writer, "反恐精英CS1.6版本", "反恐精英CS1.6很好玩");

            writer.Optimize();
            writer.Close();



            IndexSearcher searcher = new IndexSearcher("IndexDir", true);
            MultiFieldQueryParser parser = new MultiFieldQueryParser(new string[] { "title", "content" }, analyzer);
            string words = getWords("CS1.5"); //此处搜索CS1.5

            Query query = parser.Parse(words);
            Hits hits = searcher.Search(query);

            for (int i = 0; i < hits.Length(); i++)
            {
                Document doc = hits.Doc(i);
                Console.WriteLine(string.Format("title:{0} content:{1}", doc.Get("title"), doc.Get("content")));
            }
            searcher.Close();
            Console.ReadKey();

        }


        static void AddDocument(IndexWriter writer, string title, string content)
        {
            Document document = new Document();
            document.Add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
            document.Add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(document);
        }
    }
}
fun5的主页 fun5 | 初学一级 | 园豆:4
提问于:2012-06-20 10:53
< >
分享
所有回答(1)
0

你的IndexWriter使用的是StandardAnalyzer,这个分词器会过滤一些StopWords(English),作为英文句号'.'当然会被过滤,你可以在STOP_WORDS_SET中看到,当然你清除了这个集合就不会过滤了。

today4king | 园豆:3499 (老鸟四级) | 2012-06-20 12:03

我找不到那个"." 我要把那个 "."去掉

支持(0) 反对(0) fun5 | 园豆:4 (初学一级) | 2012-06-20 12:06
清除回答草稿
   您需要登录以后才能回答,未注册用户请先注册