看到 Content-Type 中有 charset=GBK,
所以我用了 Encoding.GetEncoding("GBK"),但结果还是乱码……utf-8、gb2312 以及系统默认编码都试过了,仍然是乱码。奇怪了,我的系统是中文 Windows 7。
以下是完整代码:
1 public static string GetHtml2(string url) 2 {
url="http://123.sogou.com"; 3 HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url); 4 HttpWebResponse reponse = (HttpWebResponse)request.GetResponse(); 5 string contentType = reponse.Headers["Content-Type"]; 6 Encoding encoding = null; 7 Regex regex = new Regex("charset\\s*=\\s*(\\S+)", RegexOptions.IgnoreCase); 8 Match match = null; 9 if (contentType != null) 10 { 11 match = regex.Match(contentType); 12 if (match.Success) 13 { 14 try 15 { 16 encoding = Encoding.GetEncoding(match.Groups[1].Value.Trim()); 17 using (TextReader reader = new StreamReader(reponse.GetResponseStream(), encoding)) 18 { 19 string str = reader.ReadToEnd(); 20 return str; 21 } 22 } 23 catch (Exception exx) 24 { 25 return "无法获取"; 26 } 27 } 28 } 29 if (contentType == null || (!match.Success)) 30 { 31 using (TextReader reader = new StreamReader(reponse.GetResponseStream(), Encoding.Default)) 32 { 33 string str = reader.ReadToEnd(); 34 regex = new Regex("<\\s*meta.+charset\\s*=\\s*(\\S+)\\s*\"", RegexOptions.IgnoreCase); 35 match = regex.Match(str); 36 if (match.Success) 37 { 38 try 39 { 40 encoding = Encoding.GetEncoding(match.Groups[1].Value.Trim()); 41 str = encoding.GetString(Encoding.Default.GetBytes(str)); 42 return str; 43 } 44 catch (Exception exx) 45 { 46 return "无法获取"; 47 } 48 } 49 else 50 { 51 encoding = Encoding.Default; 52 str = encoding.GetString(Encoding.Default.GetBytes(str)); 53 return str; 54 } 55 } 56 } 57 return null; 58 }
把这个Encoding.GetEncoding(match.Groups[1].Value.Trim());
使用这个函数吧encodeURIComponent(Encoding.GetEncoding(match.Groups[1].Value.Trim()));
写爬虫的童鞋可以试试神箭手云爬虫,自带JS渲染、代理ip、验证码识别等功能,还可以发布和导出爬取的数据,生成图表等,都在云端进行,不需要安装开发环境。