/// <summary>
/// Cookies returned by the most recent <see cref="GetHtml(string)"/> call.
/// They are replayed by <see cref="GetHtml(string, string)"/> on the follow-up request.
/// </summary>
private static CookieCollection cookies = new CookieCollection();

/// <summary>User-Agent header sent with every request issued by the GetHtml overloads.</summary>
private const string BrowserUserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";

/// <summary>
/// Downloads <paramref name="url"/> with a GET request, remembers the cookies the server
/// sets in the static <c>cookies</c> field, and returns the body decoded with
/// <see cref="Encoding.Default"/>.
/// </summary>
/// <param name="url">Absolute HTTP/HTTPS address of the page to fetch.</param>
/// <returns>The response body as text.</returns>
/// <exception cref="WebException">The request failed or the server returned an error status.</exception>
private string GetHtml(string url)
{
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
    req.Method = "GET";
    req.UserAgent = BrowserUserAgent;

    // HttpWebResponse.Cookies is only filled in when the request carries a
    // CookieContainer; without this line the collection always comes back empty.
    req.CookieContainer = new CookieContainer();

    using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
    using (Stream body = res.GetResponseStream())
    using (StreamReader reader = new StreamReader(body, Encoding.Default))
    {
        cookies = res.Cookies;
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Downloads <paramref name="url"/> with a GET request that sends the cookies captured by
/// <see cref="GetHtml(string)"/> and the given Referer, and returns the body decoded as GB2312.
/// </summary>
/// <param name="url">Absolute HTTP/HTTPS address of the resource to fetch.</param>
/// <param name="urlRef">Value for the Referer request header.</param>
/// <returns>The response body as text.</returns>
/// <exception cref="WebException">The request failed or the server returned an error status.</exception>
private string GetHtml(string url, string urlRef)
{
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
    req.Method = "GET";
    req.Referer = urlRef;
    req.UserAgent = BrowserUserAgent;
    req.Accept = "application/javascript, */*;q=0.8";
    req.AutomaticDecompression = DecompressionMethods.GZip;

    // Replay the cookies stored by the first request.
    CookieContainer container = new CookieContainer();
    container.Add(cookies);
    req.CookieContainer = container;

    Encoding encode = Encoding.GetEncoding("GB2312");

    using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
    using (Stream body = res.GetResponseStream())
    using (StreamReader reader = new StreamReader(body, encode))
    {
        return reader.ReadToEnd();
    }
}
做了个工具,抓取网站的Alexa的数据,从http://alexa.chinaz.com,无法获取到cookies,二次访问的时候,需要用到这些cookies
比如,http://alexa.chinaz.com/?domain=baidu.com,先是访问这个,然后得到源码后,访问里面一个Get_Data.asp,再得到IP、PV数据
要用response.Header["Set-Cookie"]去获取,下面是我做webqq登录时取cookies,你改下试试:
/// <summary>
/// Reads the Set-Cookie header of <paramref name="response"/> and stores every cookie it
/// contains in the container kept for <paramref name="qq"/> in <c>QQCookies</c>.
/// The value of the <c>ptwebqq</c> cookie is additionally recorded in <c>PTWebqq</c>.
/// </summary>
/// <param name="qq">Key under which the cookie container and ptwebqq value are stored.</param>
/// <param name="response">Response whose <c>Header</c> collection is inspected.</param>
/// <returns>
/// <c>false</c> when the response has no header collection or no Set-Cookie header
/// (treated as a failed login); otherwise <c>true</c>.
/// </returns>
private static bool SaveCookies(string qq, QQResponse response)
{
    if (response.Header == null)
    {
        return false;
    }

    // Cookies issued after a successful login.
    string ckstr = response.Header["Set-Cookie"];

    // No Set-Cookie header means the login failed.
    if (ckstr == null)
    {
        return false;
    }

    CookieContainer cc;
    if (QQCookies.Keys.Contains(qq))
    {
        cc = QQCookies[qq];
    }
    else
    {
        cc = new CookieContainer();
    }

    // The header holds several cookies joined by ',' and each cookie's attributes joined by ';'.
    // Splitting on ',' also cuts "expires" dates in two; the fragments are filtered out below.
    string[] items = ckstr.Split(new char[] { ',', ';' }, StringSplitOptions.RemoveEmptyEntries);
    foreach (string item in items)
    {
        // Split at the FIRST '=' only, so values that themselves contain '=' stay intact.
        int separator = item.IndexOf('=');
        if (separator <= 0)
        {
            // No name/value pair here (flag attribute, date fragment, or a bare "=").
            continue;
        }

        string key = item.Substring(0, separator).Trim();
        string value = item.Substring(separator + 1).Trim();

        // Skip entries that are not cookies.
        if (key.Length == 0 || value.Length == 0 || IsCookieAttribute(key))
        {
            continue;
        }

        cc.Add(new Cookie { Domain = "ptlogin2.qq.com", Name = key, Value = value });

        if (key == "ptwebqq")
        {
            PTWebqq[qq] = value;
        }
    }

    QQCookies[qq] = cc;
    return true;
}

/// <summary>
/// Tells whether <paramref name="name"/> is a Set-Cookie attribute name rather than a cookie name.
/// The comparison ignores case because servers send these names in varying case.
/// </summary>
private static bool IsCookieAttribute(string name)
{
    return string.Equals(name, "expires", StringComparison.OrdinalIgnoreCase)
        || string.Equals(name, "path", StringComparison.OrdinalIgnoreCase)
        || string.Equals(name, "domain", StringComparison.OrdinalIgnoreCase)
        || string.Equals(name, "max-age", StringComparison.OrdinalIgnoreCase);
}
@情傷:
要看“响应标头”里面的
而且上面这个的方向是“已发送”,而不是服务器返回来的。
@向往-SONG:
逻辑是这样的,http://alexa.chinaz.com/?domain=smartshe.com访问之后,通过开发人员工具,可以看到,先是页面加载了<script src="http://alexa.chinaz.com/Get_Data.asp?a=18.7157057654076&b=34.1187872763419&c=26.2778330019881&d=20.0168986083499&e=18.7157057654076&f=≈ 1419.49204771372&g=≈ 29809.1540755467"></script>
这样一个JS,在这个JS里面,再返回数据,通过JS修改页面上IP、PV等数据的值,现在我传了来源页和Accept Http标头,但获取到的数据,和真实的数据不一致,怀疑是因为Cookie的问题
document.getElementById('Rank').innerHTML='2214';document.getElementById('DayRank').innerHTML='4037';document.getElementById('WeekRank').innerHTML='3109';document.getElementById('MonthRank').innerHTML='2368';document.getElementById('QuarterRank').innerHTML='2214';document.getElementById('NextRank').innerHTML='2214';document.getElementById('IpNum').innerHTML='鈮?1502.99105367793';document.getElementById('PvNum').innerHTML='鈮?31562.6332007952';
上面这是通过程序返回的数据
document.getElementById('Rank').innerHTML='2,214'; document.getElementById('DayRank').innerHTML='4,037'; document.getElementById('WeekRank').innerHTML='3,109'; document.getElementById('MonthRank').innerHTML='2,368'; document.getElementById('QuarterRank').innerHTML='2,214';document.getElementById('NextRank').innerHTML='2,214';document.getElementById('IpNum').innerHTML='¡Ö 168,000';document.getElementById('PvNum').innerHTML='¡Ö 3,528,000';
这是开发人员工具得到的真实响应的数据.
@情傷:
你把请求http://alexa.chinaz.com/?domain=smartshe.com返回的header信息全部拿过来去发请求。
@向往-SONG:
具体怎么操作呢?全部塞里面好像不行的,会报错的,
{Content-Length: 32746
Cache-Control: private
Content-Type: text/html
Set-Cookie: User=Cookies=Have; expires=Tue, 17-Sep-2013 16:00:00 GMT; path=/,Alexa=Domain=%7Csmartshe%2Ecom%7C&Quary%5FTimes=1&Cur%5FIp=58%2E246%2E179%2E238&Last%5FTime=2012%2F9%2F18+20%3A53%3A07; expires=Tue, 18-Sep-2012 16:00:00 GMT; path=/,ASPSESSIONIDAACDCDRA=FJONLNNDGNBLLGIBLMCMCOIF; path=/
Server: Microsoft-IIS/7.5
X-Powered-By: ASP.NET
Date: Tue, 18 Sep 2012 12:53:07 GMT
}
这是header的全部信息,直接塞里面不行,只塞cookie的话,只有三个cookie,好像那个链接请求的时候,发送的cookie数是有六个的
剩余的三个不知道怎样弄出来了!
@情傷:
1.要先请求那个页面,取到http://alexa.chinaz.com/Get_Data.asp?a=26.4184890656064&b=48.1640159045726&c=37.0944333996024&d=28.2554671968191&e=26.4184890656064&f=%A1%D6%202003.98508946322&g=%A1%D6%2042083.5079522863这个地址,这个地址是动态生成的放到session里了过段时间就会失效,所以要从页面取。
2.设置Referer ,这个是后台做了判断,没有就返回空。
3.跟cookie和header没关系
4.折腾了我一个小时:(。。。
要用response.Header["Set-Cookie"]去获取 这个是正确的
response.Header["Set-Cookie"]