/// <summary>
/// Walks a range of numbered pages on www.yellowurl.cn, extracts the text located between
/// the "电子邮件" label and the "Q Q" label of each page, and appends it to the log file.
/// </summary>
/// <remarks>
/// The first page ID is taken from txtUrl via TruncBeginStr/TruncStr (presumably the digits
/// between "cn/" and ".html" - confirm against those helpers), the last page ID from txtEnd
/// and the distance between consecutive page IDs from txtDiZengId.
/// Pages that are empty or do not contain the expected markers are skipped.
/// </remarks>
/// <exception cref="FormatException">One of the text boxes does not contain a valid integer.</exception>
/// <exception cref="ArgumentOutOfRangeException">The step read from txtDiZengId is not positive.</exception>
private void GetEmail()
{
    int endId = Convert.ToInt32(txtEnd.Text.Trim());
    int step = Convert.ToInt32(txtDiZengId.Text.Trim());
    string startUrl = this.txtUrl.Text.Trim();
    int startId = Convert.ToInt32(TruncStr(TruncBeginStr(startUrl, "cn/", 3), ".html"));

    // A zero or negative step would never reach the end ID.
    if (step <= 0)
    {
        throw new ArgumentOutOfRangeException("txtDiZengId", step, "The page ID step must be greater than zero.");
    }

    // Iterate over the page IDs themselves so that the end ID really bounds the crawl and
    // an empty page does not cause the same URL to be requested again.
    for (int pageId = startId; pageId <= endId; pageId += step)
    {
        string pageUrl = string.Format("http://www.yellowurl.cn/{0}.html", pageId.ToString());
        string email = ExtractEmailField(GetWebContent(pageUrl));
        if (email != null)
        {
            StreamWriterMetod(email);
        }
    }
}

/// <summary>
/// Returns the tag-stripped text between the "电子邮件" label and the "Q Q" label of a page,
/// or null when the page is empty or any of the expected markers is missing.
/// </summary>
private static string ExtractEmailField(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return null;
    }

    int bodyStart = html.IndexOf("<body", StringComparison.Ordinal);
    if (bodyStart < 0)
    {
        return null;
    }

    int tableStart = html.IndexOf("<table", bodyStart, StringComparison.Ordinal);
    if (tableStart < 0)
    {
        return null;
    }

    int labelStart = html.IndexOf("电子邮件", tableStart, StringComparison.Ordinal);
    if (labelStart < 0)
    {
        return null;
    }

    int nextLabelStart = html.IndexOf("Q Q", labelStart, StringComparison.Ordinal);
    if (nextLabelStart < 0)
    {
        return null;
    }

    // +5 skips the four-character label plus one following character (presumably a colon -
    // confirm against the page markup); -3 drops the three characters in front of "Q Q".
    int valueStart = labelStart + 5;
    int valueLength = (nextLabelStart - 3) - valueStart;
    if (valueLength < 0)
    {
        return null;
    }

    return StripAllTags(html.Substring(valueStart, valueLength));
}
/// <summary>
/// Downloads the resource at <paramref name="Url"/> and returns its body decoded as GB2312 text.
/// </summary>
/// <param name="Url">Absolute URL of the page to fetch.</param>
/// <returns>The complete response body as a string.</returns>
/// <exception cref="WebException">The request failed or the server returned an error status.</exception>
private string GetWebContent(string Url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
    request.Headers.Set("Pragma", "no-cache");

    // Dispose the response, stream and reader even when reading fails; otherwise the
    // underlying connection stays open.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream streamReceive = response.GetResponseStream())
    using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.GetEncoding("GB2312")))
    {
        return streamReader.ReadToEnd();
    }
}
/// <summary>
/// Appends <paramref name="strEmail"/> as one line to the log file e:\log.txt.
/// </summary>
/// <param name="strEmail">Text to append.</param>
private void StreamWriterMetod(string strEmail)
{
    // "using" flushes and closes the writer even when WriteLine throws.
    using (StreamWriter swWriter = File.AppendText(@"e:\log.txt"))
    {
        swWriter.WriteLine(strEmail);
    }
}
/// <summary>
/// Removes HTML markup from <paramref name="stringToStrip"/>.
/// </summary>
/// <remarks>
/// Steps, in order: a closing paragraph tag directly followed by an opening one becomes a
/// blank line, line-break tags become a newline, every double quote becomes two single
/// quotes, all remaining tags are removed and character entities are removed.
/// </remarks>
/// <param name="stringToStrip">HTML fragment to clean.</param>
/// <returns>The fragment without tags and entities.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stringToStrip"/> is null.</exception>
public static string StripAllTags(string stringToStrip)
{
    stringToStrip = Regex.Replace(stringToStrip, "</p(?:\\s*)>(?:\\s*)<p(?:\\s*)>", "\n\n", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // The slash is optional so that "<br>" is treated like "<br/>" and "<br />".
    stringToStrip = Regex.Replace(stringToStrip, "<br(?:\\s*)/?>", "\n", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    stringToStrip = stringToStrip.Replace("\"", "''");
    stringToStrip = Regex.Replace(stringToStrip, "<[^>]+>", "", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Only real entity shapes (&name; &#123; &#x1F;) are removed, so that a plain '&'
    // followed later by ';' does not swallow the text in between.
    stringToStrip = Regex.Replace(stringToStrip, "&(?:#[0-9]+|#x[0-9a-f]+|[a-z][a-z0-9]*);", "", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    return stringToStrip;
}
/// <summary>
/// Sample showing how to run a regular expression over a piece of page markup and walk
/// through every match. The pattern is left empty on purpose; fill it in as needed.
/// </summary>
public void test()
{
    // Sample page markup.
    string html = "<table><tr><td>123@qq.com</td></tr></table>";
    // Put the regular expression for your own requirement here.
    string pattern = "";

    // For a single result use Regex.Match(html, pattern) and read Groups[1].Value;
    // Regex.Matches below covers the case of several results.
    MatchCollection found = Regex.Matches(html, pattern);
    foreach (Match item in found)
    {
        // Process each match here.
    }
}
获得网页源码之后通过Regex.Match或者Regex.Matches方法去匹配出Email地址,以前做过类似的东西,速度并不是很慢,主要是在发请求的时候要耽误时间。
多开几个线程吧
我今天也正好做这方面的测试,就一个页面,一个正则来匹配,不过速度实在太慢,只要页面内容多,匹配就慢
功能也是获取网页源码,然后用正则提取邮箱地址,也不知道是正则写得有问题,还是页面内容太多的缘故。正则是经过测试的,在小内容时可以很好地识别出邮箱地址。