using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using Lucene.Net.Store;
using System.IO;
using Lucene.Net.Index;
using Lucene.Net.Analysis.PanGu;
using Lucene.Net.Documents;
using log4net;
using System.Net;
using Lucene.Net.Search;
using System.Text;
using mshtml;
using PanGu;
using System.Xml.Linq;
using System.Text.RegularExpressions;
namespace RPSearch.Test
{
public partial class indexText : System.Web.UI.Page
{
/// <summary>
/// Page load handler. Performs no work; indexing and searching are
/// triggered by the button click handlers of this page.
/// </summary>
/// <param name="sender">The object that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
    // Intentionally empty.
}
/// <summary>
/// log4net logger for this page. Declared static readonly so a single logger
/// is shared by all page instances instead of being looked up again for
/// every instance that is created.
/// </summary>
private static readonly ILog logger = LogManager.GetLogger(typeof(indexText));
/// <summary>
/// Builds or updates the full-text index. Every topic page from id 1000 up to
/// and including the id returned by <c>GetMaxId()</c> is downloaded, parsed with
/// MSHTML, and stored in the Lucene index as a document with the fields
/// "number", "title" and "body".
/// </summary>
/// <remarks>
/// The writer and the directory are always closed, even when an exception is
/// thrown, so that no stale write.lock is left behind in the index directory.
/// A topic page that cannot be downloaded is logged and skipped.
/// </remarks>
/// <param name="sender">The object that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    string indexPath = @"F:\如鹏项目\索引";

    // Resolve the id range before touching the index, so that a failing RSS
    // request cannot leave an open writer (and its lock file) behind.
    int maxId = GetMaxId();

    // "directory" represents the index files on disk.
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
    try
    {
        // True when the directory already contains an index (update instead of create).
        bool isUpdate = IndexReader.IndexExists(directory);

        // If the index directory is still locked (for example because a previous
        // indexing run terminated abnormally), unlock it first.
        if (isUpdate && IndexWriter.IsLocked(directory))
        {
            IndexWriter.Unlock(directory);
        }

        // The analyzer tokenizes the text that is written into the index.
        IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            using (WebClient wc = new WebClient())
            {
                wc.Encoding = Encoding.UTF8; // decode the downloaded pages as UTF-8

                // "<=": the id returned by GetMaxId() is itself a topic and must be indexed too.
                for (int i = 1000; i <= maxId; i++)
                {
                    string url = "http://localhost:32768/showtopic-" + i + ".aspx";
                    string html;
                    try
                    {
                        html = wc.DownloadString(url);
                    }
                    catch (WebException ex)
                    {
                        // A single unavailable topic must not abort the whole run.
                        logger.Warn("索引" + i + "下载失败,已跳过", ex);
                        continue;
                    }

                    // Parse the page with MSHTML to extract its title and plain text.
                    HTMLDocumentClass doc = new HTMLDocumentClass();
                    doc.designMode = "on"; // keeps the parser from trying to run the page's JavaScript
                    doc.IHTMLDocument2_write(html);
                    doc.close();

                    // Lucene fields must not be null, so fall back to an empty string.
                    string title = doc.title ?? string.Empty;
                    var bodyElement = doc.body;
                    string body = (bodyElement == null ? null : bodyElement.innerText) ?? string.Empty;

                    // Delete any existing document with the same number before adding it
                    // again; otherwise re-indexing would store duplicates.
                    writer.DeleteDocuments(new Term("number", i.ToString()));

                    // Only content that needs full-text search is ANALYZED.
                    Document document = new Document();
                    document.Add(new Field("number", i.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                    document.Add(new Field("title", title, Field.Store.YES, Field.Index.NOT_ANALYZED));
                    document.Add(new Field("body", body, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                    writer.AddDocument(document);

                    logger.Debug("索引" + i + "下载完毕");
                    Console.WriteLine("索引" + i + "完毕");
                }
            }
        }
        finally
        {
            // Closing the writer releases the index lock.
            writer.Close();
        }
    }
    finally
    {
        // Do not forget to close the directory, otherwise the indexed data cannot be searched.
        directory.Close();
    }

    logger.Debug("全部下载完毕");
}
/// <summary>
/// Searches the "body" field of the index for the space-separated words typed
/// into <c>TextBox1</c> and binds the hits (number, title and a highlighted
/// body preview) to <c>Repeater1</c>.
/// </summary>
/// <remarks>
/// The searcher, reader and directory are always closed. Blank keyword input
/// and repeated spaces are tolerated: empty words are dropped, and when no
/// word remains an empty result list is bound without touching the index.
/// </remarks>
/// <param name="sender">The object that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void Button2_Click(object sender, EventArgs e)
{
    string indexPath = @"F:\如鹏项目\索引";
    string kw = TextBox1.Text;
    List<SearchResult> listResult = new List<SearchResult>();

    // The user separates the words with spaces (e.g. "计算机 专业"). Empty entries
    // produced by repeated spaces would become empty terms that never match.
    string[] words = (kw ?? string.Empty).Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    if (words.Length > 0)
    {
        FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
        try
        {
            IndexReader reader = IndexReader.Open(directory, true);
            try
            {
                IndexSearcher searcher = new IndexSearcher(reader);
                try
                {
                    PhraseQuery query = new PhraseQuery();
                    foreach (string word in words)
                    {
                        query.Add(new Term("body", word));
                    }
                    query.SetSlop(100);

                    TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                    searcher.Search(query, null, collector);
                    ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;

                    foreach (ScoreDoc hit in docs)
                    {
                        // A hit only carries the document id assigned by Lucene.Net; the
                        // Document itself is loaded on demand, which keeps memory usage low.
                        Document doc = searcher.Doc(hit.doc);
                        string body = doc.Get("body");

                        listResult.Add(new SearchResult
                        {
                            Number = doc.Get("number"),
                            Title = doc.Get("title"),
                            // Guard against documents that have no stored body.
                            BodyPreview = body == null ? string.Empty : Preview(body, kw)
                        });
                    }
                }
                finally
                {
                    searcher.Close();
                }
            }
            finally
            {
                // The searcher was given an existing reader, so the reader is closed explicitly.
                reader.Close();
            }
        }
        finally
        {
            directory.Close();
        }
    }

    Repeater1.DataSource = listResult;
    Repeater1.DataBind();
}
/// <summary>
/// Produces a highlighted preview of <paramref name="body"/> for the given keyword.
/// </summary>
/// <param name="body">Text to take the preview fragment from.</param>
/// <param name="keyword">Keyword(s) to highlight.</param>
/// <returns>The best matching fragment as returned by the PanGu highlighter.</returns>
private static string Preview(string body,string keyword)
{
    // The highlighter is built from an HTML formatter (prefix and suffix placed
    // around each highlighted word) and a PanGu Segment object.
    PanGu.HighLight.Highlighter fragmentHighlighter = new PanGu.HighLight.Highlighter(
        new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"red\">", "</font>"),
        new Segment());

    // Number of characters per summary fragment.
    fragmentHighlighter.FragmentSize = 100;

    // Return the best matching summary fragment.
    return fragmentHighlighter.GetBestFragment(keyword, body);
}
/// <summary>
/// Determines the highest topic id published in the forum's RSS feed by
/// inspecting the "showtopic-{id}.aspx" link of every item in the feed.
/// </summary>
/// <returns>The largest topic id found in the feed.</returns>
/// <exception cref="InvalidOperationException">
/// The feed has no channel element, or none of its items carries a link of
/// the form "showtopic-{id}.aspx" with a valid integer id.
/// </exception>
private int GetMaxId()
{
    XDocument xdoc = XDocument.Load("http://localhost:32768/tools/rss.aspx");
    XElement channel = xdoc.Root == null ? null : xdoc.Root.Element("channel");
    if (channel == null)
    {
        throw new InvalidOperationException("The RSS feed contains no <channel> element; the maximum topic id cannot be determined.");
    }

    bool found = false;
    int maxId = 0;

    // Look at every item instead of only the first one, so the result does not
    // depend on the order in which the feed lists its topics.
    foreach (XElement item in channel.Elements("item"))
    {
        XElement link = item.Element("link");
        if (link == null)
        {
            continue;
        }

        Match match = Regex.Match(link.Value, @"showtopic-(\d+)\.aspx");
        int id;
        // TryParse also rejects ids that do not fit into an Int32.
        if (!match.Success || !int.TryParse(match.Groups[1].Value, out id))
        {
            continue;
        }

        if (!found || id > maxId)
        {
            maxId = id;
            found = true;
        }
    }

    if (!found)
    {
        throw new InvalidOperationException("The RSS feed contains no item with a showtopic-{id}.aspx link; the maximum topic id cannot be determined.");
    }

    return maxId;
}
}
}
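// Note: this error is most likely caused by a lock conflict on the index (a leftover lock rather than a true deadlock): every time an IndexWriter is created, a corresponding write.lock file is created as well, and only one such lock may exist at any given time.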