/// <summary>
/// Downloads the page at <paramref name="strLink"/> and returns its text with every
/// carriage-return and line-feed character replaced by a single space.
/// </summary>
/// <param name="strLink">
/// The address to fetch. When it does not start with "http://" or "https://",
/// "http://" is prepended before the request is created.
/// </param>
/// <returns>
/// The flattened page text, prefixed with "|RedirectWeb|" when the URI that finally
/// answered differs from the URI that was requested. Returns <see cref="string.Empty"/>
/// when <paramref name="strLink"/> is null or empty, or when anything goes wrong.
/// </returns>
public static string GetHtml(string strLink)
{
    if (string.IsNullOrEmpty(strLink))
    {
        return string.Empty;
    }

    HttpWebRequest request = null;
    try
    {
        // Only a leading scheme counts; a "http://" buried in a query string must not
        // suppress the prefix, and "https://" links must be left untouched.
        bool hasScheme = strLink.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || strLink.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        request = (HttpWebRequest)WebRequest.Create(hasScheme ? strLink : "http://" + strLink);

        // Alternate between two user-agent strings depending on the parity of the current second.
        if (DateTime.Now.Second % 2 == 0)
        {
            request.UserAgent = "Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1;.NET CLR 2.0.50727)";
        }
        else
        {
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.1) Web-Sniffer/1.0.24";
        }

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            // UTF-8 when the server says so; every other case falls back to code page 936.
            string charset = response.CharacterSet;
            if (charset != null)
            {
                charset = charset.Trim(' ', '"', '\'');
            }

            Encoding encoding = string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase)
                ? Encoding.UTF8
                : Encoding.GetEncoding(936);

            string strResults;
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoding))
            {
                strResults = reader.ReadToEnd().Replace('\r', ' ').Replace('\n', ' ');
            }

            // Compare parsed URIs rather than the raw input string, so that a missing
            // scheme or trailing slash in strLink is not mistaken for a redirect.
            if (response.ResponseUri != request.RequestUri)
            {
                strResults = "|RedirectWeb|" + strResults;
            }

            return strResults;
        }
    }
    catch (Exception)
    {
        // Best-effort contract: callers receive an empty string on any failure.
        return string.Empty;
    }
    finally
    {
        if (request != null)
        {
            request.Abort();
        }
    }
}
利用以上函数获取 HTML 网页源码，再利用正则表达式匹配出所需的内容。
先取到网页源码，然后用正则表达式提取所需的数据。
个人信息就这样被搞走了，还有悲催的垃圾邮件、骚扰电话等在等着你。
找到具体的简历地址（或其他有效的 URL 地址），用 HttpWebRequest 请求此 URL，获取返回的全部网页代码，再利用字符串截取得到相应的数据。
/// <summary>
/// Fetches <paramref name="url"/> and returns the response body decoded with the
/// named text encoding.
/// </summary>
/// <param name="url">The address passed to <c>WebRequest.Create</c>.</param>
/// <param name="encoding">An encoding name accepted by <c>Encoding.GetEncoding</c>.</param>
/// <returns>The response text, or an empty string when any step fails.</returns>
protected string GetHtmlSource(string url, string encoding)
{
    try
    {
        // Resolve the encoding first so an invalid name fails before any network traffic.
        Encoding textEncoding = Encoding.GetEncoding(encoding);

        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

        // The using blocks release the response, stream and reader even when reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, textEncoding))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Best-effort contract: callers receive an empty string on any failure.
        return "";
    }
}