The code is:
#region Scraping
/// <summary>
/// Fetches the content of the specified remote web page.
/// Needs System, System.IO, System.Net, System.Text and, for GZipInputStream,
/// ICSharpCode.SharpZipLib.GZip.
/// </summary>
/// <param name="strUrl">URL of the remote page to fetch</param>
/// <param name="timeout">Timeout in milliseconds; 8000 is a typical value</param>
/// <param name="enterType">Line-break handling: 0 strips line breaks, 1 keeps them</param>
/// <param name="EnCodeType">Text encoding of the response body</param>
/// <param name="Parms">Optional extras; Parms[0], if present, is sent as the Cookie header</param>
public static string GetRequestString(string strUrl, int timeout, int enterType, Encoding EnCodeType, params string[] Parms)
{
    string strResult;
    try
    {
        HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(strUrl);
        myReq.Method = "GET";
        myReq.KeepAlive = true;
        myReq.Timeout = timeout;
        myReq.Headers["Accept-Encoding"] = "gzip,deflate";
        myReq.Headers["Accept-Language"] = "zh-cn,zh;q=0.5";
        myReq.Headers["Accept-Charset"] = "GB2312,utf-8;q=0.7,*;q=0.7";
        // params with no arguments yields an empty array, not null,
        // so check the length too before indexing Parms[0].
        if (Parms != null && Parms.Length > 0 && !string.IsNullOrEmpty(Parms[0]))
            myReq.Headers["Cookie"] = Parms[0];
        myReq.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
        myReq.Referer = strUrl;
        myReq.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15";

        StringBuilder sb = new StringBuilder(40960);
        // Dispose the response and its stream. If they are never closed, the
        // underlying connections stay checked out, and once the per-host
        // connection limit (2 by default) is used up, subsequent requests
        // block until they time out.
        using (HttpWebResponse HttpWResp = (HttpWebResponse)myReq.GetResponse())
        using (Stream myStream = HttpWResp.GetResponseStream())
        {
            string rpheader = HttpWResp.Headers["Content-Encoding"];
            if (rpheader != null)
            {
                // Compressed response: decompress with SharpZipLib. This assumes
                // gzip; a "deflate" response would need a different stream type.
                byte[] writeData = new byte[4096];
                using (Stream s = new GZipInputStream(myStream))
                {
                    int size;
                    while ((size = s.Read(writeData, 0, writeData.Length)) > 0)
                    {
                        // Caution: decoding chunk by chunk can split a multi-byte
                        // character across two reads; buffering all bytes before
                        // decoding would be safer.
                        sb.Append(EnCodeType.GetString(writeData, 0, size));
                    }
                }
                if (enterType != 1)
                {
                    sb.Replace("\r\n", "");
                    sb.Replace("\r", "");
                    sb.Replace("\n", "");
                }
            }
            else
            {
                // Plain response: read line by line with the given encoding.
                using (StreamReader sr = new StreamReader(myStream, EnCodeType))
                {
                    while (-1 != sr.Peek())
                    {
                        sb.Append(sr.ReadLine());
                        // ReadLine strips the terminator, so re-append one
                        // when line breaks are requested.
                        if (enterType == 1)
                            sb.Append("\r\n");
                    }
                }
            }
        }
        strResult = sb.ToString();
    }
    catch (Exception err)
    {
        // The original code threw and then had an unreachable return;
        // rethrow with the original exception preserved as InnerException.
        throw new Exception("Request error: " + err.Message, err);
    }
    return strResult;
}
#endregion
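For reference, a minimal call might look like this (the URL is a placeholder and GB2312 is just an example encoding):

    // Fetch one page with an 8-second timeout, stripping line breaks; no cookie is passed.
    string html = GetRequestString("http://example.com/list.aspx", 8000, 0, Encoding.GetEncoding("GB2312"));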
When scraping pages, sometimes it times out after 100 pages, sometimes after only ten-odd pages. But as soon as I open the Fiddler2 monitoring tool it recovers after a moment, and I don't know why.
There are two likely causes:
1. The server of the site you are scraping can't take the load.
2. The site may have anti-scraping protection, and the threshold differs from site to site. With Baidu Tieba, once the limit kicks in, access is blocked for a while after a fixed 20 or 40 pages (I don't remember which). Personally I think 中华英才 (ChinaHR) handles this side of things well.
If you want to make full use of your own bandwidth and the target site has no access limits, multithreading is a good choice: while one request is waiting, the other threads aren't sitting idle. See the sketch below.
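A minimal sketch of that idea, assuming the GetRequestString above is in scope (say, defined in the same class); the URL list and the connection limit are placeholder values, and CountdownEvent needs .NET 4:

    using System;
    using System.Net;
    using System.Text;
    using System.Threading;

    class CrawlSketch
    {
        static void Main()
        {
            // Hypothetical page list; replace with the real URLs to scrape.
            string[] urls = { "http://example.com/p1", "http://example.com/p2" };
            string[] results = new string[urls.Length];

            // HttpWebRequest allows only 2 concurrent connections per host by
            // default; raise the limit or the extra threads just queue up.
            ServicePointManager.DefaultConnectionLimit = 10;

            CountdownEvent done = new CountdownEvent(urls.Length);
            for (int i = 0; i < urls.Length; i++)
            {
                int idx = i; // copy the loop variable before capturing it
                ThreadPool.QueueUserWorkItem(delegate
                {
                    try
                    {
                        results[idx] = GetRequestString(urls[idx], 8000, 0, Encoding.UTF8);
                    }
                    catch (Exception ex)
                    {
                        results[idx] = "failed: " + ex.Message;
                    }
                    finally
                    {
                        done.Signal(); // count this page as done either way
                    }
                });
            }
            done.Wait(); // block until every page has been fetched or has failed
        }
    }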
I've tried many times, and every time it starts working again shortly after I open the Fiddler2 monitoring tool. What's wrong with the code?
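One detail worth checking, since it fits that symptom: the original code never closed HttpWResp or myStream, and HttpWebRequest allows only two connections per host by default, so leaked connections can leave every new request waiting until it times out. Fiddler2 registers itself as the system proxy, which changes how connections are pooled and can mask the leak. Closing every response (the using blocks in the version above) and raising ServicePointManager.DefaultConnectionLimit, as in the multithreading sketch, is the usual cure.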
@问题很严重,加班吧。: Try adding an X-Forwarded-For request header with an arbitrary IP as its value.
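If you want to try that with the method above, the header can be set alongside the other Headers assignments. A sketch (many servers simply ignore X-Forwarded-For, so this only helps if the target trusts it):

    // Hypothetical: send a random client IP with each request.
    Random rnd = new Random();
    myReq.Headers["X-Forwarded-For"] = string.Format("{0}.{1}.{2}.{3}",
        rnd.Next(1, 224), rnd.Next(0, 256), rnd.Next(0, 256), rnd.Next(0, 256));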