/*下面是代码,代码没有问题。读别的页面一般没有问题,但是读带有百度百科的页面时,读到百科就中断并返回字符串了,得到的不是完整的 HTML 代码,只有一部分,不知道该怎么处理了。
请各位帮帮忙,这个问题困扰我好久了。*/
// Example call: GetHtml("http://www.soso.com/q?pid=s.idx&cid=s.idx.se&w=%D1%B5%C1%B7%CB%FE");

/// <summary>
/// Downloads the resource at <paramref name="url"/> with an HTTP GET request and
/// returns the response body decoded as text.
/// </summary>
/// <param name="url">Absolute URL of the page to download.</param>
/// <returns>The decoded response body.</returns>
/// <remarks>
/// The text encoding is taken from the <c>charset</c> parameter of the response's
/// Content-Type header; when it is missing or names an unknown encoding,
/// <see cref="Encoding.Default"/> is used. A gzip-encoded body is decompressed.
/// Exceptions raised while sending the request or reading the response
/// (for example on timeout) are not caught here and propagate to the caller.
/// </remarks>
public string GetHtml(string url)
{
    HttpWebRequest wReq = (HttpWebRequest)WebRequest.Create(url);
    wReq.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215;)";

    // Send the leading "http://host/" part of the URL as the Referer.
    Match a = Regex.Match(url, @"(http://).[^/]*[?=/]", RegexOptions.IgnoreCase);
    wReq.Referer = a.Groups[0].Value;
    wReq.Method = "GET";

    // Give up after 12 seconds.
    wReq.Timeout = 12000;

    // The using blocks release the connection and streams even when reading fails.
    using (HttpWebResponse wResp = (HttpWebResponse)wReq.GetResponse())
    using (System.IO.Stream respStream = wResp.GetResponseStream())
    {
        // Work out the page encoding. Only the charset token itself is captured,
        // so quotes or further ";"-separated parameters do not end up in the name.
        Encoding wCode = Encoding.Default;
        Match charSetMatch = Regex.Match(
            wResp.ContentType ?? "",
            @"charset\s*=\s*[""']?([^\s;""']+)",
            RegexOptions.IgnoreCase);
        if (charSetMatch.Success)
        {
            try
            {
                wCode = Encoding.GetEncoding(charSetMatch.Groups[1].Value);
            }
            catch (System.ArgumentException)
            {
                // Unknown charset name: deliberately keep the default encoding.
            }
        }

        // Header values are case-insensitive, so "GZIP" must be accepted as well.
        bool isGzip = string.Equals(wResp.ContentEncoding, "gzip", System.StringComparison.OrdinalIgnoreCase);
        System.IO.Stream bodyStream = isGzip
            ? new GZipStream(respStream, CompressionMode.Decompress)
            : respStream;

        // Disposing the reader also disposes bodyStream (and the stream it wraps).
        using (System.IO.StreamReader reader = new System.IO.StreamReader(bodyStream, wCode))
        {
            return reader.ReadToEnd();
        }
    }
}
// Setting proposed in this reply as missing from GetHtml; it is applied to the request object (wReq) before GetResponse() is called.
wReq.KeepAlive = true;
缺少这个设置啊.
我试了一下,不管用呀,它还是到了百科就断了。
@格雷: 红色的代码是我改过的。你可以使用 Fiddler2 这个工具来抓包,看看浏览器正常访问时提交了什么数据。
/// <summary>
/// Downloads the resource at <paramref name="url"/> with an HTTP GET request,
/// sending browser-like request headers, and returns the response body decoded as text.
/// </summary>
/// <param name="url">Absolute URL of the page to download.</param>
/// <returns>The decoded response body.</returns>
/// <remarks>
/// The text encoding is taken from the <c>charset</c> parameter of the response's
/// Content-Type header; when it is missing or names an unknown encoding,
/// <see cref="Encoding.Default"/> is used. Exceptions raised while sending the
/// request or reading the response (for example on timeout) are not caught here
/// and propagate to the caller.
/// </remarks>
public string GetHtml(string url)
{
    HttpWebRequest wReq = (HttpWebRequest)WebRequest.Create(url);
    wReq.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215;)";

    // Send the leading "http://host/" part of the URL as the Referer.
    Match a = Regex.Match(url, @"(http://).[^/]*[?=/]", RegexOptions.IgnoreCase);
    wReq.Referer = a.Groups[0].Value;
    wReq.Method = "GET";
    wReq.Accept = "text/html, application/xhtml+xml, */*";
    wReq.Headers.Add("Accept-Language", "zh-cn");
    wReq.Headers.Add("Accept-Charset", "GBK,utf-8;");

    // Let the framework advertise gzip/deflate (it adds the Accept-Encoding header
    // itself) and decompress the body transparently. The previous hand-written
    // header offered deflate as well, but only gzip bodies were ever decoded.
    wReq.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

    // Give up after 12 seconds.
    wReq.Timeout = 12000;
    wReq.KeepAlive = true;

    // The using blocks release the connection and streams even when reading fails.
    using (HttpWebResponse wResp = (HttpWebResponse)wReq.GetResponse())
    using (System.IO.Stream respStream = wResp.GetResponseStream())
    {
        // Work out the page encoding. Only the charset token itself is captured,
        // so quotes or further ";"-separated parameters do not end up in the name.
        Encoding wCode = Encoding.Default;
        Match charSetMatch = Regex.Match(
            wResp.ContentType ?? "",
            @"charset\s*=\s*[""']?([^\s;""']+)",
            RegexOptions.IgnoreCase);
        if (charSetMatch.Success)
        {
            try
            {
                wCode = Encoding.GetEncoding(charSetMatch.Groups[1].Value);
            }
            catch (System.ArgumentException)
            {
                // Unknown charset name: deliberately keep the default encoding.
            }
        }

        // respStream already yields decompressed bytes, so it is read directly.
        using (System.IO.StreamReader reader = new System.IO.StreamReader(respStream, wCode))
        {
            return reader.ReadToEnd();
        }
    }
}
@袁佳文:
谢谢!
我调用你的方法
GetHtml("http://www.soso.com/q?pid=s.idx&cid=s.idx.se&w=%D1%B5%C1%B7%CB%FE");
然后把得到的字符串写到 txt中是个完整的html 代码;
但是我并不是要查看它,或者把它写到什么地方,我是要操作这个字符串来获取网站排名;可是当我操作返回的字符串时,它就成了
只有一部分的 HTML 了,
比如:
我现在搜的词是“训练塔”,想要获取 www.hc360.com 的排名,但是我在返回的字符串中根本找不到这个网址,
如果用浏览器查看搜搜 的搜索结果他在第二名,第一名是百度百科,返回的字符串就停在了在百度百科中间
没找到是什么原因
谢谢了,不知道还有没有其他的方式可以获取搜搜的排名呢?只能对网页进行处理吗?
用 WebClient.DownloadString(url) 不行吗
谢谢,刚刚试过了。。不行,和我得到的一样