1 //以下是部分代码 2 List<string> Weburllist = new List<string>(); 3 List<string> Weburllistzx = new List<string>(); 4 StringBuilder weburlSB = new StringBuilder(); 5 bool IsGenxin = false; 6 MatchCollection mcexplain = Regex.Matches(sjurlDR["LinkContent"].ToString(), @"(?<=\$-\$).*?(?=\$_\$)", RegexOptions.Singleline); 7 foreach (Match m in mcexplain) 8 { 9 Weburllist.Add(m.Value); 10 } 11 12 System.Net.WebRequest newswebrequest = System.Net.WebRequest.Create(sjurlDR["LinkUrl"].ToString()); 13 14 Uri uri = new Uri(sjurlDR["LinkUrl"].ToString()); 15 SetHeaderValue(newswebrequest.Headers, "Host", uri.Host); 16 SetHeaderValue(newswebrequest.Headers, "UserAgent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0"); 17 SetHeaderValue(newswebrequest.Headers, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); 18 SetHeaderValue(newswebrequest.Headers, "Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"); 19 SetHeaderValue(newswebrequest.Headers, "Accept-Encoding", "gzip, deflate, sdch"); 20 SetHeaderValue(newswebrequest.Headers, "Cookie:", "gscu_792856215=62888640q5c56420; _gscbrs_792856215=1"); 21 SetHeaderValue(newswebrequest.Headers, "Connection", "Keep-Alive"); 22 SetHeaderValue(newswebrequest.Headers, "Cache-Control", "max-age=0"); 23 24 //newswebrequest.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip, deflate, sdch"); 25 //newswebrequest.Headers.Add(HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.8"); 26 //newswebrequest.Headers.Add(HttpRequestHeader.CacheControl, "max-age=0"); 27 //SetHeaderValue(newswebrequest.Headers, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); 28 //SetHeaderValue(newswebrequest.Headers, "Connection", "Keep-Alive"); 29 //newswebrequest.Headers.Add(HttpRequestHeader.Cookie, "_gscu_792856215=62888640q5c56420; _gscbrs_792856215=1"); 30 //SetHeaderValue(newswebrequest.Headers, "Host", "zjks.com"); 31 //SetHeaderValue(newswebrequest.Headers, "UserAgent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"); 32 33 34 System.Net.WebResponse newswebresponse = newswebrequest.GetResponse(); 35 System.IO.Stream newsstream = newswebresponse.GetResponseStream(); 36 System.IO.StreamReader sr = new StreamReader(newsstream, System.Text.Encoding.UTF8); 37 string ProductionContent = string.Empty; 38 ProductionContent = sr.ReadToEnd(); 39 sr.Close(); 40 41 Regex reg = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+/?"); 42 string wangzhanyuming = reg.Match(sjurlDR["LinkUrl"].ToString(), 0).Value; 43 MatchCollection mc = Regex.Matches(ProductionContent.Replace("href=\"/", "href=\"" + wangzhanyuming).Replace("href='/", "href='" + wangzhanyuming).Replace("href=/", "href=" + wangzhanyuming).Replace("href=\"./", "href= 44 45 \""+ wangzhanyuming), @"<[aA][^>]* href=[^>]*>", RegexOptions.Singleline); 46 int Index = 1; 47 foreach (Match m in mc) 48 { 49 MatchCollection mc1 = Regex.Matches(m.Value.Replace("\"", "'"), @"[a-zA-z]+://[^']*", RegexOptions.Singleline); 50 if (mc1.Count > 0) 51 { 52 foreach (Match m1 in mc1) 53 { 54 string linkurlstr = string.Empty; 55 linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", ""); 56 weburlSB.Append("$-$"); 57 weburlSB.Append(linkurlstr); 58 weburlSB.Append("$_$"); 59 if (!Weburllist.Contains(linkurlstr) && !Weburllistzx.Contains(linkurlstr)) 60 { 61 IsGenxin = true; 62 Weburllistzx.Add(linkurlstr); 63 linkSb.AppendFormat("{0}<br/>", linkurlstr); 64 } 65 } 66 } 67 else 68 { 69 if (m.Value.IndexOf("javascript") == -1) 70 { 71 string amstr = string.Empty; 72 string wangzhanxiangduilujin = string.Empty; 73 wangzhanxiangduilujin = sjurlDR["LinkUrl"].ToString().Substring(0, sjurlDR["LinkUrl"].ToString().LastIndexOf("/") + 1); 74 amstr = m.Value.Replace("href=\"", "href=\"" + wangzhanxiangduilujin).Replace("href='", "href='" + wangzhanxiangduilujin); 75 MatchCollection mc11 = Regex.Matches(amstr, @"[a-zA-z]+://[^\s]*", RegexOptions.Singleline); 76 77 foreach (Match m1 in mc11) 78 { 79 string linkurlstr = string.Empty; 80 linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", ""); 81 weburlSB.Append("$-$"); 82 weburlSB.Append(linkurlstr); 83 weburlSB.Append("$_$"); 84 if (!Weburllist.Contains(linkurlstr) && !Weburllistzx.Contains(linkurlstr)) 85 { 86 IsGenxin = true; 87 Weburllistzx.Add(linkurlstr); 88 linkSb.AppendFormat("{0}<br/>", linkurlstr); 89 } 90 } 91 } 92 } 93 Index++; 94 } 95 System.Threading.Thread.Sleep(1000); 96 if (IsGenxin) 97 { 98 originlinksInfo oinfo = new originlinksInfo(); 99 oinfo = originlinksLogic.Get(int.Parse(sjurlDR["ID"].ToString())); 100 oinfo.LinkContentnext = oinfo.LinkContent; 101 oinfo.LinkContent = weburlSB.ToString(); 102 originlinksLogic.Update(oinfo); 103 System.Threading.Thread.Sleep(2000); 104 } 105 106 //如http://www.zjks.com/,这个网站总是采集失败,在这句代码 107 System.Net.WebResponse newswebresponse = newswebrequest.GetResponse();//这里在采集时总是跳出
跪求思路!
是不是你后面请求没有带上前面生成的cookie又或者是请求太快了。系统拒绝了。
你调试看看,可能是你某些时候,执行请求的url是有问题的。try catch一下。
一般的网址可以采集到,但一些做了反爬的网址就采集不到直接报错,异常
@Supper_litt: 调试了!好比http://www.zjks.com/ 这个网站前边的代码可以运行,但到System.Net.WebResponse newswebresponse = newswebrequest.GetResponse();
就直接出错!catch跳出了
报出500异常
@魇: 哦,你设置一下,请求的http版本,我看他是用的。http 1.1
@Supper_litt:
request.ProtocolVersion = HttpVersion.Version11;
记得下次问问题不要贴这么多代码,看都不会看,就说关键点就ok.
@Supper_litt:
应该是网站自身的问题,他统计了用户的很多请求header
Accept: text/html, application/xhtml+xml, */*
X-HttpWatch-RID: 41941-10009
Accept-Language: zh-CN
User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
Accept-Encoding: gzip, deflate
Host: www.zjks.com
Connection: Keep-Alive
你加上一些需要的header再试,我用socket测试了一下,可以拿到数据。
首先 楼主用System.Net.WebRequest 。我是不用的。 所以不熟。不知道有没有试过 htmlunit + jsoup。好用。
其次。 把你要爬取的网址给出来啊。 这样我们也可以自己写个小爬虫试着爬一下啊。。
htmlunit + jsoup 没用过! 其实一般没做反爬取的网址的内容我能够爬取内容,但做了反爬取的网址就会出异常报错,比如这个网站http://www.zjks.com/
@魇: 额。兄弟。 浙江考试网 应该不是很难爬吧。 我爬过南京工商网、江苏工商局、南京地方税务局、国家企业政策网、国家税务总局、等很多政府网站,另外还有搜狐新闻、腾讯新闻、豆丁网、豆瓣电影、百度文库等、基本都可以的。我现在在公司。 没办法做验证。 公司 有一个爬虫插件在爬。爬虫上外网需要配置代理类。 那边那台机子在爬 我这里没办法跑爬虫。 所以 必须回家才能做验证了。
对比正常请求和你的采集请求,找到差异点,进行修改。HTTP协议无状态,根本是请求本身有差异,才会被对方识别到。
大神,表示这块的内容我也是菜鸟,照着别人的路子过来的!我吧源码贴上,你看看
using System; using System.Collections.Generic; using System.Web; using System.Web.UI; using System.Web.UI.WebControls; using System.Data; using System.Text; using System.Text.RegularExpressions; using System.IO; using KaoShi110.BLL; using KaoShi110.Model; using KaoShi110.Forum; using KaoShi110.Common; public partial class WebsiteAdmin_News_Originlinklist : System.Web.UI.Page { ExamDirectory ExamDirectoryLogic = new ExamDirectory(); UAddress UAddressLogic = new UAddress(); originlinks originlinksLogic = new originlinks(); originnews originnewsLogic = new originnews(); protected void Page_Load(object sender, EventArgs e) { if (!IsPostBack) { if (KaoShi110.Forum.ForumUtils.GetCookie("roles").IndexOf("SuperAdmin") < 0 && KaoShi110.Forum.ForumUtils.GetCookie("roles").IndexOf("NewsManage") < 0) { Response.Redirect("/WebsiteAdmin/Default.aspx"); } StartDate.Text = DateTime.Now.ToString("yyyy-MM-dd"); EndDate.Text = DateTime.Now.ToString("yyyy-MM-dd"); BindDDLDirectory(); BindDDLExam(); BindProvince(); Bindurllist(); bindInfolist(); } } #region Private Property /// <summary> /// 频道ID /// </summary> public int DirectoryID { get { int i = 0; if (DDLDirectory.Visible) { int.TryParse(DDLDirectory.SelectedItem.Value, out i); } return i; } } /// <summary> /// 考试ID /// </summary> public int ExamID { get { int i = 0; if (DDLExam.Visible) { int.TryParse(DDLExam.SelectedItem.Value, out i); } return i; } } #endregion #region DataBind /// <summary> /// 绑定频道目录 /// </summary> protected void BindDDLDirectory() { DDLDirectory.Items.Clear(); Dictionary<int[], string> WhereFilter = new Dictionary<int[], string>(); ExamDirectoryLogic.EnumToDictionary(WhereFilter, ExamDirectoryInfo.Fields.DirectoryLevel, OperationEnum.Equal, "1"); List<int> OrderFilter = new List<int>(); ExamDirectoryLogic.EnumToOrderByEnum(OrderFilter, ExamDirectoryInfo.Fields.DirectoryOrder, SortingAction.Desc); ExamDirectoryLogic.EnumToOrderByEnum(OrderFilter, ExamDirectoryInfo.Fields.CreateDate, SortingAction.Desc); DataSet ds = ExamDirectoryLogic.GetByQuery(WhereFilter, OrderFilter); if (ds != null && ds.Tables[0].Rows.Count > 0) { DDLDirectory.DataSource = ds; DDLDirectory.DataTextField = "DirectoryName"; DDLDirectory.DataValueField = "DirectoryID"; DDLDirectory.DataBind(); } DDLDirectory.Items.Insert(0, new ListItem("公共", "888")); DDLDirectory.Items.Insert(0, new ListItem("全部", "0")); } /// <summary> /// 绑定考试栏目列表 /// </summary> protected void BindDDLExam() { DDLExam.Items.Clear(); Dictionary<int[], string> WhereFilter = new Dictionary<int[], string>(); ExamDirectoryLogic.EnumToDictionary(WhereFilter, ExamDirectoryInfo.Fields.DirectoryLevel, OperationEnum.Equal, "2"); ExamDirectoryLogic.EnumToDictionary(WhereFilter, ExamDirectoryInfo.Fields.ParentID, OperationEnum.Equal, DirectoryID); List<int> OrderFilter = new List<int>(); ExamDirectoryLogic.EnumToOrderByEnum(OrderFilter, ExamDirectoryInfo.Fields.DirectoryOrder, SortingAction.Desc); ExamDirectoryLogic.EnumToOrderByEnum(OrderFilter, ExamDirectoryInfo.Fields.CreateDate, SortingAction.Desc); DataSet ds = ExamDirectoryLogic.GetByQuery(WhereFilter, OrderFilter); if (ds != null && ds.Tables[0].Rows.Count > 0) { DDLExam.DataSource = ds; DDLExam.DataTextField = "DirectoryName"; DDLExam.DataValueField = "DirectoryID"; DDLExam.DataBind(); } DDLExam.Items.Insert(0, new ListItem("公共", "888")); DDLExam.Items.Insert(0, new ListItem("全部", "0")); } private void BindProvince() { DDLProvince.Items.Clear(); Dictionary<int[], string> WhereFilter = new Dictionary<int[], string>(); UAddressLogic.EnumToDictionary(WhereFilter, UAddressInfo.Fields.IsActive, OperationEnum.Equal, true); UAddressLogic.EnumToDictionary(WhereFilter, UAddressInfo.Fields.ALevel, OperationEnum.Equal, 1); DataSet ds = UAddressLogic.GetByQuery(WhereFilter, null); if (ds != null && ds.Tables[0].Rows.Count > 0) { DDLProvince.DataSource = ds; DDLProvince.DataTextField = "AName"; DDLProvince.DataValueField = "ID"; DDLProvince.DataBind(); } DDLProvince.Items.Insert(0, new ListItem("全部", "0")); } private void Bindurllist() { ddl_urllist.Items.Clear(); Dictionary<int[], string> WhereFilter = new Dictionary<int[], string>(); if (DirectoryID > 0) { originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.DirectoryID, OperationEnum.Equal, DirectoryID); } if (ExamID > 0) { originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.ExamID, OperationEnum.Equal, ExamID); } if (!DDLProvince.SelectedItem.Value.Equals("0")) { originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.DiquID, OperationEnum.Equal, DDLProvince.SelectedItem.Value); } DataSet ds = originlinksLogic.GetByQuery(WhereFilter, null); if (ds != null && ds.Tables[0].Rows.Count > 0) { ddl_urllist.DataSource = ds; ddl_urllist.DataTextField = "WebsiteName"; ddl_urllist.DataValueField = "ID"; ddl_urllist.DataBind(); } ddl_urllist.Items.Insert(0, new ListItem("全部", "0")); } /// <summary> /// 绑定新闻列表 /// </summary> private void bindInfolist() { try { liAdd.Text = string.Format("<a href='AddOriginlink.aspx?did=" + DirectoryID + "&eid=" + ExamID + "' target='_blank'>{0}</a>", "添加网址"); Dictionary<int[], string> WhereFilter = new Dictionary<int[], string>(); if (DirectoryID > 0) { originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.DirectoryID, OperationEnum.Equal, DirectoryID); } if (ExamID > 0) { originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.ExamID, OperationEnum.Equal, ExamID); } if (!DDLProvince.SelectedItem.Value.Equals("0")) { originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.DiquID, OperationEnum.Equal, DDLProvince.SelectedItem.Value); } if (!ddl_urllist.SelectedItem.Value.Equals("0")) { originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.ID, OperationEnum.Equal, ddl_urllist.SelectedItem.Value); } AspNetPager1.RecordCount = originlinksLogic.GetByQueryCount(WhereFilter); List<int> OrderFilter = new List<int>(); originlinksLogic.EnumToOrderByEnum(OrderFilter, originlinksInfo.Fields.ID, SortingAction.Desc); DataSet ds = originlinksLogic.GetByQuery(WhereFilter, OrderFilter, AspNetPager1.CurrentPageIndex, AspNetPager1.PageSize); if (ds != null && ds.Tables[0].Rows.Count > 0) { InforList.Visible = true; InforList.DataSource = ds; InforList.DataBind(); L_InforListStr.Text = string.Empty; } else { InforList.Visible = false; L_InforListStr.Text = "暂无满足条件的网址!"; } } catch { InforList.Visible = false; L_InforListStr.Text = "出错了!!!!!!!!"; } } #endregion #region Controls change protected void AspNetPager1_PageChanging(object src, Wuqi.Webdiyer.PageChangingEventArgs e) { try { AspNetPager1.CurrentPageIndex = e.NewPageIndex; bindInfolist(); } catch { } } protected void DDLDirectory_Changed(object sender, EventArgs e) { try { BindDDLExam(); AspNetPager1.CurrentPageIndex = 0; Bindurllist(); bindInfolist(); } catch { } } protected void DDLExam_Changed(object sender, EventArgs e) { try { AspNetPager1.CurrentPageIndex = 0; Bindurllist(); bindInfolist(); } catch { } } protected void DDLProvince_Changed(object sender, EventArgs e) { try { AspNetPager1.CurrentPageIndex = 0; Bindurllist(); bindInfolist(); } catch { } } #endregion #region Event /// <summary> /// 绑定数据 /// </summary> /// <param name="Sender"></param> /// <param name="e"></param> protected void RelationList_DataBound(Object Sender, RepeaterItemEventArgs e) { if (e.Item.ItemType == ListItemType.Item || e.Item.ItemType == ListItemType.AlternatingItem) { LinkButton likBtn = (LinkButton)e.Item.FindControl("LinkButton1"); if (KaoShi110.Forum.ForumUtils.GetCookie("roles").IndexOf("SuperAdmin") != -1 || KaoShi110.Forum.ForumUtils.GetCookie("roles").IndexOf("NewsDelete") != -1) { likBtn.Visible = true; } else { likBtn.Visible = false; } } } protected void RelationList_ItemCommand(object sender, RepeaterCommandEventArgs e) { if (e.CommandName == "del") //删除整条新闻 { LinkButton likBtn = (LinkButton)e.Item.FindControl("LinkButton1"); try { originlinksLogic.Delete(int.Parse(likBtn.ToolTip)); bindInfolist(); DNTRequest.RaiseTip(this.Page, "删除成功!"); } catch { DNTRequest.RaiseTip(this.Page, "删除失败!!!!!!!!"); } } } protected void BtnSearch_Click(object sender, EventArgs e) { try { InforContent.Text = string.Empty; AspNetPager1.CurrentPageIndex = 0; bindInfolist(); } catch { } } protected void BtnCaiji_Click(object sender, EventArgs e) { try { StringBuilder linkSb = new StringBuilder(); InforContent.Text = string.Empty; Dictionary<int[], string> WhereFilter = new Dictionary<int[], string>(); if (DirectoryID > 0) { originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.DirectoryID, OperationEnum.Equal, DirectoryID); } if (ExamID > 0) { originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.ExamID, OperationEnum.Equal, ExamID); } if (!DDLProvince.SelectedItem.Value.Equals("0")) { originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.DiquID, OperationEnum.Equal, DDLProvince.SelectedItem.Value); } if (!ddl_urllist.SelectedItem.Value.Equals("0")) { originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.ID, OperationEnum.Equal, ddl_urllist.SelectedItem.Value); } List<int> OrderFilter = new List<int>(); originlinksLogic.EnumToOrderByEnum(OrderFilter, originlinksInfo.Fields.ID, SortingAction.Desc); DataSet ds = originlinksLogic.GetByQuery(WhereFilter, OrderFilter); foreach (DataRow sjurlDR in ds.Tables[0].Rows) { List<string> Weburllist = new List<string>(); List<string> Weburllistzx = new List<string>(); StringBuilder weburlSB = new StringBuilder(); bool IsGenxin = false; //最新的内容 数据库 MatchCollection mcexplain = Regex.Matches(sjurlDR["LinkContent"].ToString(), @"(?<=\$-\$).*?(?=\$_\$)", RegexOptions.Singleline); foreach (Match m in mcexplain) { Weburllist.Add(m.Value); } //网址方面 System.Net.WebRequest newswebrequest = System.Net.WebRequest.Create(sjurlDR["LinkUrl"].ToString()); System.Net.WebResponse newswebresponse = newswebrequest.GetResponse(); System.IO.Stream newsstream = newswebresponse.GetResponseStream(); System.IO.StreamReader sr = new StreamReader(newsstream, System.Text.Encoding.UTF8); string ProductionContent = string.Empty; // ProductionContent = sr.ReadToEnd(); sr.Close(); Regex reg = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+/?"); string wangzhanyuming = reg.Match(sjurlDR["LinkUrl"].ToString(), 0).Value; MatchCollection mc = Regex.Matches(ProductionContent.Replace("href=\"/", "href=\"" + wangzhanyuming).Replace("href='/", "href='" + wangzhanyuming).Replace("href=/", "href=" + wangzhanyuming).Replace("href=\"./", "href=\"" + wangzhanyuming), @"<[aA][^>]* href=[^>]*>", RegexOptions.Singleline); int Index = 1; foreach (Match m in mc) { MatchCollection mc1 = Regex.Matches(m.Value, @"[a-zA-z]+://[^\s]*", RegexOptions.Singleline); if (mc1.Count > 0) { foreach (Match m1 in mc1) { string linkurlstr = string.Empty; linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", ""); weburlSB.Append("$-$"); weburlSB.Append(linkurlstr); weburlSB.Append("$_$"); if (!Weburllist.Contains(linkurlstr) && !Weburllistzx.Contains(linkurlstr)) { IsGenxin = true; Weburllistzx.Add(linkurlstr); linkSb.AppendFormat("{0}<br/>", linkurlstr); //Dictionary<int[], string> WhereFilter1 = new Dictionary<int[], string>(); //originnewsLogic.EnumToDictionary(WhereFilter1, originnewsInfo.Fields.linkUrl, OperationEnum.Equal, linkurlstr); //if (!originnewsLogic.IsExists(WhereFilter1)) //{ // originnewsInfo oninfo = new originnewsInfo(); // oninfo.CreateDate = DateTime.Now; // oninfo.linkUrl = linkurlstr; // int onint = originnewsLogic.Insert(oninfo); // linkSb.AppendFormat("{0}<br/>", linkurlstr); //} } } } else { if (m.Value.IndexOf("javascript") == -1) { string amstr = string.Empty; string wangzhanxiangduilujin = string.Empty; wangzhanxiangduilujin = sjurlDR["LinkUrl"].ToString().Substring(0, sjurlDR["LinkUrl"].ToString().LastIndexOf("/") + 1); amstr = m.Value.Replace("href=\"", "href=\"" + wangzhanxiangduilujin).Replace("href='", "href='" + wangzhanxiangduilujin); MatchCollection mc11 = Regex.Matches(amstr, @"[a-zA-z]+://[^\s]*", RegexOptions.Singleline); foreach (Match m1 in mc11) { string linkurlstr = string.Empty; linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", ""); weburlSB.Append("$-$"); weburlSB.Append(linkurlstr); weburlSB.Append("$_$"); if (!Weburllist.Contains(linkurlstr) && !Weburllistzx.Contains(linkurlstr)) { IsGenxin = true; Weburllistzx.Add(linkurlstr); linkSb.AppendFormat("{0}<br/>", linkurlstr); //Dictionary<int[], string> WhereFilter1 = new Dictionary<int[], string>(); //originnewsLogic.EnumToDictionary(WhereFilter1, originnewsInfo.Fields.linkUrl, OperationEnum.Equal, linkurlstr); //if (!originnewsLogic.IsExists(WhereFilter1)) //{ // originnewsInfo oninfo = new originnewsInfo(); // oninfo.CreateDate = DateTime.Now; // oninfo.linkUrl = linkurlstr; // int onint = originnewsLogic.Insert(oninfo); // linkSb.AppendFormat("{0}<br/>", linkurlstr); //} } } } } Index++; } System.Threading.Thread.Sleep(1000); if (IsGenxin) { originlinksInfo oinfo = new originlinksInfo(); oinfo = originlinksLogic.Get(int.Parse(sjurlDR["ID"].ToString())); oinfo.LinkContentnext = oinfo.LinkContent; oinfo.LinkContent = weburlSB.ToString(); originlinksLogic.Update(oinfo); System.Threading.Thread.Sleep(2000); } } InforContent.Text = linkSb.ToString(); DNTRequest.RaiseTip(this.Page, "采集成功!"); } catch (Exception ex) { DNTRequest.RaiseTip(this.Page, "出错了,采集失败!!!!!!!!!"); } } //protected void BtnSCaiji_Click(object sender, EventArgs e) //{ // try // { // StringBuilder linkSb = new StringBuilder(); // InforContent.Text = string.Empty; // Dictionary<int[], string> WhereFilter = new Dictionary<int[], string>(); // if (DirectoryID > 0) // { // originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.DirectoryID, OperationEnum.Equal, DirectoryID); // } // if (ExamID > 0) // { // originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.ExamID, OperationEnum.Equal, ExamID); // } // if (!DDLProvince.SelectedItem.Value.Equals("0")) // { // originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.DiquID, OperationEnum.Equal, DDLProvince.SelectedItem.Value); // } // if (!ddl_urllist.SelectedItem.Value.Equals("0")) // { // originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.ID, OperationEnum.Equal, ddl_urllist.SelectedItem.Value); // } // List<int> OrderFilter = new List<int>(); // originlinksLogic.EnumToOrderByEnum(OrderFilter, originlinksInfo.Fields.ID, SortingAction.Desc); // DataSet ds = originlinksLogic.GetByQuery(WhereFilter, OrderFilter); // foreach (DataRow sjurlDR in ds.Tables[0].Rows) // { // List<string> Weburllist = new List<string>(); // List<string> Weburllistzx = new List<string>(); // StringBuilder weburlSB = new StringBuilder(); // MatchCollection mcexplainzx = Regex.Matches(sjurlDR["LinkContent"].ToString(), @"(?<=\$-\$).*?(?=\$_\$)", RegexOptions.Singleline); // MatchCollection mcexplain = Regex.Matches(sjurlDR["LinkContentnext"].ToString(), @"(?<=\$-\$).*?(?=\$_\$)", RegexOptions.Singleline); // foreach (Match m in mcexplain) // { // Weburllist.Add(m.Value); // } // foreach (Match m in mcexplainzx) // { // if (!Weburllist.Contains(m.Value) && !Weburllistzx.Contains(m.Value)) // { // Weburllistzx.Add(m.Value); // linkSb.AppendFormat("{0}<br/>", m.Value); // } // } // } // InforContent.Text = linkSb.ToString(); // } // catch // { } //} protected void BtnSearchcaiji_Click(object sender, EventArgs e) { try { InforContent.Text = string.Empty; StringBuilder linkSb = new StringBuilder(); Dictionary<int[], string> WhereFilter = new Dictionary<int[], string>(); originnewsLogic.EnumToDictionary(WhereFilter, originnewsInfo.Fields.CreateDate, OperationEnum.MorethanEqual, StartDate.Text); originnewsLogic.EnumToDictionary(WhereFilter, originnewsInfo.Fields.CreateDate, OperationEnum.LessThanEqual, DateTime.Parse(EndDate.Text).AddDays(1)); if (!string.IsNullOrEmpty(tb_link.Text.Trim())) { originnewsLogic.EnumToDictionary(WhereFilter, originnewsInfo.Fields.linkUrl, OperationEnum.Like, tb_link.Text.Trim()); } List<int> OrderFilter = new List<int>(); originnewsLogic.EnumToOrderByEnum(OrderFilter, originnewsInfo.Fields.ID, SortingAction.Desc); DataSet ds = originnewsLogic.GetByQuery(WhereFilter, OrderFilter); foreach (DataRow sjurlDR in ds.Tables[0].Rows) { linkSb.AppendFormat("{0}<br/>", sjurlDR["linkUrl"]); } InforContent.Text = linkSb.ToString(); } catch { } } #endregion }
主要的方法BtnCaiji_Click 这里面
@魇: 要直接对比两个请求,你发代码也没用。。。
你直接改cookie怕是不算登录吧?
wo也不晓得,菜鸟求大神指点!
写爬虫的童鞋可以试试神箭手云爬虫,自带JS渲染、代理ip、验证码识别等功能,还可以发布和导出爬取的数据,生成图表等,都在云端进行,不需要安装开发环境。
开发文档参考:http://doc.shenjianshou.cn/