我写了一个抓取网页源码的例子,但得到的结果不是想要的。例如:抓取链接 "http://www.google.com.hk/search?&num=20&q=%e6%ad%a6%e5%a4%a7" 的源码,得到的内容与在浏览器中打开此链接看到的不同。求帮助,不胜感激。
/// <summary>
/// Console sample: reads a keyword from standard input, runs a Google (hk)
/// search for it, downloads the result page's HTML and saves it to "123.txt".
/// </summary>
class Program
{
    // Pulls the charset token out of a Content-Type header value such as
    // "text/html; charset=utf-8" (optionally quoted, any letter case).
    static readonly Regex CharsetPattern =
        new Regex("charset\\s*=\\s*[\"']?([^\\s;\"']+)", RegexOptions.IgnoreCase);

    /// <summary>
    /// URL-encodes <paramref name="keyword"/> using the named text encoding.
    /// </summary>
    /// <param name="keyword">Text to place in a query string.</param>
    /// <param name="codeType">Encoding name understood by <see cref="Encoding.GetEncoding(string)"/>, e.g. "utf-8".</param>
    /// <returns>The percent-encoded keyword.</returns>
    static string ChangeKeywordEncoding(string keyword, string codeType)
    {
        return HttpUtility.UrlEncode(keyword, Encoding.GetEncoding(codeType));
    }

    /// <summary>
    /// Chooses the encoding used to decode a response body.
    /// </summary>
    /// <param name="codeType">
    /// "auto" to use the charset announced in <paramref name="contentType"/>;
    /// "gb2312", "gbk" or "utf-8" to force that encoding; any other value
    /// selects UTF-8.
    /// </param>
    /// <param name="contentType">The response's Content-Type header value; may be null or empty.</param>
    /// <returns>
    /// The selected encoding. In "auto" mode <see cref="Encoding.Default"/> is
    /// returned when the header carries no charset or names one that this
    /// runtime does not know.
    /// </returns>
    static Encoding ResolveEncoding(string codeType, string contentType)
    {
        switch (codeType)
        {
            case "auto":
                Match charSetMatch = CharsetPattern.Match(contentType ?? string.Empty);
                if (!charSetMatch.Success)
                {
                    // The server did not announce a charset.
                    return Encoding.Default;
                }
                try
                {
                    return Encoding.GetEncoding(charSetMatch.Groups[1].Value);
                }
                catch (ArgumentException)
                {
                    // Unknown charset name: fall back instead of failing the whole download.
                    return Encoding.Default;
                }
            case "gb2312":
                return Encoding.GetEncoding("gb2312");
            case "gbk":
                return Encoding.GetEncoding("gbk");
            case "utf-8":
                return Encoding.UTF8;
            default:
                return Encoding.UTF8;
        }
    }

    /// <summary>
    /// Downloads <paramref name="url"/> with an HTTP GET and returns the body as text.
    /// </summary>
    /// <param name="url">Absolute URL of the page to fetch.</param>
    /// <param name="codeType">Decoding mode; see <see cref="ResolveEncoding"/>.</param>
    /// <returns>The decoded response body.</returns>
    /// <exception cref="WebException">The request failed or timed out.</exception>
    static string GetHttpPageSourceCode(string url, string codeType)
    {
        HttpWebRequest wReq = (HttpWebRequest)WebRequest.Create(url);
        wReq.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215;)";
        wReq.Referer = url;
        wReq.Method = "GET";
        // Request timeout: 30 seconds.
        wReq.Timeout = 30000;

        // Dispose the response (and, through the reader, its stream) even when
        // decoding throws, so the underlying connection is released.
        using (HttpWebResponse wResp = (HttpWebResponse)wReq.GetResponse())
        {
            // Determine the page encoding.
            Encoding wCode = ResolveEncoding(codeType, wResp.ContentType);
            using (StreamReader reader = new StreamReader(wResp.GetResponseStream(), wCode))
            {
                return reader.ReadToEnd();
            }
        }
    }

    /// <summary>
    /// Entry point: reads the keyword, fetches the search page and writes it to "123.txt".
    /// </summary>
    static void Main(string[] args)
    {
        string keyword = Console.ReadLine();
        string baseURL = "http://www.google.com.hk/search?&num=20&q=" + ChangeKeywordEncoding(keyword, "utf-8");
        string sourceCode = GetHttpPageSourceCode(baseURL, "utf-8");

        // The writer must be disposed so its buffer is flushed; otherwise the
        // file can be left empty or truncated.
        // NOTE(review): the page is decoded as UTF-8 but saved as GB2312, so
        // characters GB2312 cannot represent will not survive — consider
        // writing the file as UTF-8 instead.
        using (StreamWriter sw = new StreamWriter("123.txt", false, Encoding.GetEncoding("GB2312")))
        {
            sw.WriteLine(sourceCode);
        }

        // Keep the console window open until a key is pressed.
        Console.Read();
    }
}
得到的不是想要的?能具体说说哪里不一样吗?
代码看起来没有问题。
使用 WebClient 来获取远程页面似乎更好:
/// <summary>
/// Downloads <paramref name="url"/> and decodes the response bytes as GB2312 text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The decoded page text, or <c>null</c> when the download or decoding fails
/// for any reason (best-effort contract: no exception reaches the caller).
/// </returns>
private string GetHtml(string url)
{
    try
    {
        // WebClient is IDisposable; dispose it so its resources are released
        // even when the download throws.
        using (WebClient myClient = new WebClient())
        {
            myClient.Headers.Add("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; QQPinyin 730; EmbeddedWB 14.52 from: http://www.bsalsa.com/ EmbeddedWB 14.52; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)");
            byte[] result = myClient.DownloadData(url);
            // NOTE(review): the encoding is hard-coded; pages served in another
            // charset (e.g. UTF-8) will decode as garbled text — confirm the target site.
            return Encoding.GetEncoding("gb2312").GetString(result);
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: callers receive null instead of an exception.
        return null;
    }
}