首页 新闻 搜索 专区 学院

采集反爬虫网站,就是采集网址的更新内容的链接,采集不到内容!

-1
悬赏园豆:100 [已解决问题] 解决于 2016-05-19 11:53
  1 //以下是部分代码
  2  List<string> Weburllist = new List<string>();
  3                 List<string> Weburllistzx = new List<string>();
  4                 StringBuilder weburlSB = new StringBuilder();
  5                 bool IsGenxin = false;
  6                 MatchCollection mcexplain = Regex.Matches(sjurlDR["LinkContent"].ToString(), @"(?<=\$-\$).*?(?=\$_\$)", RegexOptions.Singleline);
  7                 foreach (Match m in mcexplain)
  8                 {
  9                     Weburllist.Add(m.Value);
 10                 }               
 11 
 12                 System.Net.WebRequest newswebrequest = System.Net.WebRequest.Create(sjurlDR["LinkUrl"].ToString());
 13 
 14                 Uri uri = new Uri(sjurlDR["LinkUrl"].ToString());
 15                 SetHeaderValue(newswebrequest.Headers, "Host", uri.Host);
 16                 SetHeaderValue(newswebrequest.Headers, "UserAgent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0");
 17                 SetHeaderValue(newswebrequest.Headers, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
 18                 SetHeaderValue(newswebrequest.Headers, "Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
 19                 SetHeaderValue(newswebrequest.Headers, "Accept-Encoding", "gzip, deflate, sdch");
 20                 SetHeaderValue(newswebrequest.Headers, "Cookie:", "gscu_792856215=62888640q5c56420; _gscbrs_792856215=1");
 21                 SetHeaderValue(newswebrequest.Headers, "Connection", "Keep-Alive");
 22                 SetHeaderValue(newswebrequest.Headers, "Cache-Control", "max-age=0");
 23 
 24                 //newswebrequest.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip, deflate, sdch");
 25                 //newswebrequest.Headers.Add(HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.8");
 26                 //newswebrequest.Headers.Add(HttpRequestHeader.CacheControl, "max-age=0");
 27                 //SetHeaderValue(newswebrequest.Headers, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
 28                 //SetHeaderValue(newswebrequest.Headers, "Connection", "Keep-Alive");
 29                 //newswebrequest.Headers.Add(HttpRequestHeader.Cookie, "_gscu_792856215=62888640q5c56420; _gscbrs_792856215=1");
 30                 //SetHeaderValue(newswebrequest.Headers, "Host", "zjks.com");
 31                 //SetHeaderValue(newswebrequest.Headers, "UserAgent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36");
 32 
 33 
 34                 System.Net.WebResponse newswebresponse = newswebrequest.GetResponse();
 35                 System.IO.Stream newsstream = newswebresponse.GetResponseStream();
 36                 System.IO.StreamReader sr = new StreamReader(newsstream, System.Text.Encoding.UTF8);
 37                 string ProductionContent = string.Empty;
 38                 ProductionContent = sr.ReadToEnd();
 39                 sr.Close();
 40 
 41                 Regex reg = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+/?");
 42                 string wangzhanyuming = reg.Match(sjurlDR["LinkUrl"].ToString(), 0).Value;
 43                 MatchCollection mc = Regex.Matches(ProductionContent.Replace("href=\"/", "href=\"" + wangzhanyuming).Replace("href='/", "href='" + wangzhanyuming).Replace("href=/", "href=" + wangzhanyuming).Replace("href=\"./", "href=
 44 
 45 \""+ wangzhanyuming), @"<[aA][^>]* href=[^>]*>", RegexOptions.Singleline);
 46                 int Index = 1;
 47                 foreach (Match m in mc)
 48                 {
 49                     MatchCollection mc1 = Regex.Matches(m.Value.Replace("\"", "'"), @"[a-zA-z]+://[^']*", RegexOptions.Singleline);
 50                     if (mc1.Count > 0)
 51                     {
 52                         foreach (Match m1 in mc1)
 53                         {
 54                             string linkurlstr = string.Empty;
 55                             linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
 56                             weburlSB.Append("$-$");
 57                             weburlSB.Append(linkurlstr);
 58                             weburlSB.Append("$_$");
 59                             if (!Weburllist.Contains(linkurlstr) && !Weburllistzx.Contains(linkurlstr))
 60                             {
 61                                 IsGenxin = true;
 62                                 Weburllistzx.Add(linkurlstr);
 63                                 linkSb.AppendFormat("{0}<br/>", linkurlstr);
 64                             }
 65                         }
 66                     }
 67                     else
 68                     {
 69                         if (m.Value.IndexOf("javascript") == -1)
 70                         {
 71                             string amstr = string.Empty;
 72                             string wangzhanxiangduilujin = string.Empty;
 73                             wangzhanxiangduilujin = sjurlDR["LinkUrl"].ToString().Substring(0, sjurlDR["LinkUrl"].ToString().LastIndexOf("/") + 1);
 74                             amstr = m.Value.Replace("href=\"", "href=\"" + wangzhanxiangduilujin).Replace("href='", "href='" + wangzhanxiangduilujin);
 75                             MatchCollection mc11 = Regex.Matches(amstr, @"[a-zA-z]+://[^\s]*", RegexOptions.Singleline);
 76 
 77                             foreach (Match m1 in mc11)
 78                             {
 79                                 string linkurlstr = string.Empty;
 80                                 linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
 81                                 weburlSB.Append("$-$");
 82                                 weburlSB.Append(linkurlstr);
 83                                 weburlSB.Append("$_$");
 84                                 if (!Weburllist.Contains(linkurlstr) && !Weburllistzx.Contains(linkurlstr))
 85                                 {
 86                                     IsGenxin = true;
 87                                     Weburllistzx.Add(linkurlstr);
 88                                     linkSb.AppendFormat("{0}<br/>", linkurlstr);
 89                                 }
 90                             }
 91                         }
 92                     }
 93                     Index++;
 94                 }
 95                 System.Threading.Thread.Sleep(1000);
 96                 if (IsGenxin)
 97                 {
 98                     originlinksInfo oinfo = new originlinksInfo();
 99                     oinfo = originlinksLogic.Get(int.Parse(sjurlDR["ID"].ToString()));
100                     oinfo.LinkContentnext = oinfo.LinkContent;
101                     oinfo.LinkContent = weburlSB.ToString();
102                     originlinksLogic.Update(oinfo);
103                     System.Threading.Thread.Sleep(2000);
104                 }
105                     
106 //如http://www.zjks.com/,这个网站总是采集失败,在这句代码
107 System.Net.WebResponse newswebresponse = newswebrequest.GetResponse();//这里在采集时总是跳出

跪求思路!

快舔包我很肥的主页 快舔包我很肥 | 初学一级 | 园豆:69
提问于:2016-05-11 22:22
< >
分享
最佳答案
1

是不是你后面请求没有带上前面生成的cookie又或者是请求太快了。系统拒绝了。

收获园豆:70
Supper_litt | 小虾三级 |园豆:783 | 2016-05-11 22:27

你调试看看,可能是你某些时候,执行请求的url是有问题的。try catch一下。

Supper_litt | 园豆:783 (小虾三级) | 2016-05-11 22:29

一般的网址可以采集到,但一些做了反爬的网址就采集不到直接报错,异常

快舔包我很肥 | 园豆:69 (初学一级) | 2016-05-12 10:04

@Supper_litt: 调试了!好比http://www.zjks.com/  这个网站前边的代码可以运行,但到System.Net.WebResponse newswebresponse = newswebrequest.GetResponse();

就直接出错!catch跳出了

快舔包我很肥 | 园豆:69 (初学一级) | 2016-05-12 10:06

报出500异常

快舔包我很肥 | 园豆:69 (初学一级) | 2016-05-12 10:07

@魇: 哦,你设置一下,请求的http版本,我看他是用的。http 1.1

Supper_litt | 园豆:783 (小虾三级) | 2016-05-12 11:23

@Supper_litt: 

request.ProtocolVersion = HttpVersion.Version11;

记得下次问问题不要贴这么多代码,看都不会看,就说关键点就ok.

Supper_litt | 园豆:783 (小虾三级) | 2016-05-12 11:28

@Supper_litt: 

应该是网站自身的问题,他统计了用户的很多请求header

Accept: text/html, application/xhtml+xml, */*
X-HttpWatch-RID: 41941-10009
Accept-Language: zh-CN
User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
Accept-Encoding: gzip, deflate
Host: www.zjks.com
Connection: Keep-Alive

你加上一些需要的header再试,我用socket测试了一下,可以拿到数据。

Supper_litt | 园豆:783 (小虾三级) | 2016-05-12 11:36
其他回答(4)
0

首先  楼主用System.Net.WebRequest 。我是不用的。  所以不熟。不知道有没有试过 htmlunit + jsoup。好用。

其次。 把你要爬取的网址给出来啊。 这样我们也可以自己写个小爬虫试着爬一下啊。。

收获园豆:5
情不知所起一往而深 | 园豆:87 (初学一级) | 2016-05-12 08:49

htmlunit + jsoup  没用过!    其实一般没做反爬取的网址的内容我能够爬取内容,但做了反爬取的网址就会出异常报错,比如这个网站http://www.zjks.com/

支持(0) 反对(0) 快舔包我很肥 | 园豆:69 (初学一级) | 2016-05-12 10:19

@魇: 额。兄弟。  浙江考试网 应该不是很难爬吧。 我爬过南京工商网、江苏工商局、南京地方税务局、国家企业政策网、国家税务总局、等很多政府网站,另外还有搜狐新闻、腾讯新闻、豆丁网、豆瓣电影、百度文库等、基本都可以的。我现在在公司。 没办法做验证。 公司 有一个爬虫插件在爬。爬虫上外网需要配置代理类。 那边那台机子在爬 我这里没办法跑爬虫。 所以 必须回家才能做验证了。

0

对比正常请求和你的采集请求,找到差异点,进行修改。HTTP协议无状态,根本是请求本身有差异,才会被对方识别到。

收获园豆:20
幻天芒 | 园豆:36594 (高人七级) | 2016-05-12 09:06

大神,表示这块的内容我也是菜鸟,照着别人的路子过来的!我吧源码贴上,你看看

using System;
using System.Collections.Generic;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Data;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;

using KaoShi110.BLL;
using KaoShi110.Model;
using KaoShi110.Forum;
using KaoShi110.Common;

public partial class WebsiteAdmin_News_Originlinklist : System.Web.UI.Page
{
    ExamDirectory ExamDirectoryLogic = new ExamDirectory();
    UAddress UAddressLogic = new UAddress();
    originlinks originlinksLogic = new originlinks();
    originnews originnewsLogic = new originnews();

    protected void Page_Load(object sender, EventArgs e)
    {
        if (!IsPostBack)
        {
            if (KaoShi110.Forum.ForumUtils.GetCookie("roles").IndexOf("SuperAdmin") < 0 && KaoShi110.Forum.ForumUtils.GetCookie("roles").IndexOf("NewsManage") < 0)
            {
                Response.Redirect("/WebsiteAdmin/Default.aspx");
            }
            StartDate.Text = DateTime.Now.ToString("yyyy-MM-dd");
            EndDate.Text = DateTime.Now.ToString("yyyy-MM-dd");
            BindDDLDirectory();
            BindDDLExam();
            BindProvince();
            Bindurllist();
            bindInfolist();
        }
    }

    #region Private Property

    /// <summary>
    /// 频道ID
    /// </summary>
    public int DirectoryID
    {
        get
        {
            int i = 0;
            if (DDLDirectory.Visible)
            {
                int.TryParse(DDLDirectory.SelectedItem.Value, out i);
            }
            return i;
        }
    }

    /// <summary>
    /// 考试ID
    /// </summary>
    public int ExamID
    {
        get
        {
            int i = 0;
            if (DDLExam.Visible)
            {
                int.TryParse(DDLExam.SelectedItem.Value, out i);
            }
            return i;
        }
    }

    #endregion

    #region DataBind

    /// <summary>
    /// 绑定频道目录
    /// </summary>
    protected void BindDDLDirectory()
    {
        DDLDirectory.Items.Clear();
        Dictionary<int[], string> WhereFilter = new Dictionary<int[], string>();
        ExamDirectoryLogic.EnumToDictionary(WhereFilter, ExamDirectoryInfo.Fields.DirectoryLevel, OperationEnum.Equal, "1");
        List<int> OrderFilter = new List<int>();
        ExamDirectoryLogic.EnumToOrderByEnum(OrderFilter, ExamDirectoryInfo.Fields.DirectoryOrder, SortingAction.Desc);
        ExamDirectoryLogic.EnumToOrderByEnum(OrderFilter, ExamDirectoryInfo.Fields.CreateDate, SortingAction.Desc);
        DataSet ds = ExamDirectoryLogic.GetByQuery(WhereFilter, OrderFilter);
        if (ds != null && ds.Tables[0].Rows.Count > 0)
        {
            DDLDirectory.DataSource = ds;
            DDLDirectory.DataTextField = "DirectoryName";
            DDLDirectory.DataValueField = "DirectoryID";
            DDLDirectory.DataBind();
        }
        DDLDirectory.Items.Insert(0, new ListItem("公共", "888"));
        DDLDirectory.Items.Insert(0, new ListItem("全部", "0"));
    }

    /// <summary>
    /// 绑定考试栏目列表
    /// </summary>
    protected void BindDDLExam()
    {
        DDLExam.Items.Clear();
        Dictionary<int[], string> WhereFilter = new Dictionary<int[], string>();
        ExamDirectoryLogic.EnumToDictionary(WhereFilter, ExamDirectoryInfo.Fields.DirectoryLevel, OperationEnum.Equal, "2");
        ExamDirectoryLogic.EnumToDictionary(WhereFilter, ExamDirectoryInfo.Fields.ParentID, OperationEnum.Equal, DirectoryID);
        List<int> OrderFilter = new List<int>();
        ExamDirectoryLogic.EnumToOrderByEnum(OrderFilter, ExamDirectoryInfo.Fields.DirectoryOrder, SortingAction.Desc);
        ExamDirectoryLogic.EnumToOrderByEnum(OrderFilter, ExamDirectoryInfo.Fields.CreateDate, SortingAction.Desc);
        DataSet ds = ExamDirectoryLogic.GetByQuery(WhereFilter, OrderFilter);
        if (ds != null && ds.Tables[0].Rows.Count > 0)
        {
            DDLExam.DataSource = ds;
            DDLExam.DataTextField = "DirectoryName";
            DDLExam.DataValueField = "DirectoryID";
            DDLExam.DataBind();
        }
        DDLExam.Items.Insert(0, new ListItem("公共", "888"));
        DDLExam.Items.Insert(0, new ListItem("全部", "0"));
    }

    private void BindProvince()
    {
        DDLProvince.Items.Clear();
        Dictionary<int[], string> WhereFilter = new Dictionary<int[], string>();
        UAddressLogic.EnumToDictionary(WhereFilter, UAddressInfo.Fields.IsActive, OperationEnum.Equal, true);
        UAddressLogic.EnumToDictionary(WhereFilter, UAddressInfo.Fields.ALevel, OperationEnum.Equal, 1);
        DataSet ds = UAddressLogic.GetByQuery(WhereFilter, null);
        if (ds != null && ds.Tables[0].Rows.Count > 0)
        {
            DDLProvince.DataSource = ds;
            DDLProvince.DataTextField = "AName";
            DDLProvince.DataValueField = "ID";
            DDLProvince.DataBind();
        }
        DDLProvince.Items.Insert(0, new ListItem("全部", "0"));
    }

    private void Bindurllist()
    {
        ddl_urllist.Items.Clear();
        Dictionary<int[], string> WhereFilter = new Dictionary<int[], string>();
        if (DirectoryID > 0)
        {
            originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.DirectoryID, OperationEnum.Equal, DirectoryID);
        }
        if (ExamID > 0)
        {
            originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.ExamID, OperationEnum.Equal, ExamID);
        }
        if (!DDLProvince.SelectedItem.Value.Equals("0"))
        {
            originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.DiquID, OperationEnum.Equal, DDLProvince.SelectedItem.Value);
        }
        DataSet ds = originlinksLogic.GetByQuery(WhereFilter, null);
        if (ds != null && ds.Tables[0].Rows.Count > 0)
        {
            ddl_urllist.DataSource = ds;
            ddl_urllist.DataTextField = "WebsiteName";
            ddl_urllist.DataValueField = "ID";
            ddl_urllist.DataBind();
        }
        ddl_urllist.Items.Insert(0, new ListItem("全部", "0"));
    }

    /// <summary>
    /// 绑定新闻列表
    /// </summary>
    private void bindInfolist()
    {
        try
        {
            liAdd.Text = string.Format("<a href='AddOriginlink.aspx?did=" + DirectoryID + "&eid=" + ExamID + "' target='_blank'>{0}</a>", "添加网址");
            Dictionary<int[], string> WhereFilter = new Dictionary<int[], string>();

            if (DirectoryID > 0)
            {
                originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.DirectoryID, OperationEnum.Equal, DirectoryID);
            }
            if (ExamID > 0)
            {
                originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.ExamID, OperationEnum.Equal, ExamID);
            }
            if (!DDLProvince.SelectedItem.Value.Equals("0"))
            {
                originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.DiquID, OperationEnum.Equal, DDLProvince.SelectedItem.Value);
            }
            if (!ddl_urllist.SelectedItem.Value.Equals("0"))
            {
                originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.ID, OperationEnum.Equal, ddl_urllist.SelectedItem.Value);
            }
            AspNetPager1.RecordCount = originlinksLogic.GetByQueryCount(WhereFilter);
            List<int> OrderFilter = new List<int>();
            originlinksLogic.EnumToOrderByEnum(OrderFilter, originlinksInfo.Fields.ID, SortingAction.Desc);
            DataSet ds = originlinksLogic.GetByQuery(WhereFilter, OrderFilter, AspNetPager1.CurrentPageIndex, AspNetPager1.PageSize);
            if (ds != null && ds.Tables[0].Rows.Count > 0)
            {
                InforList.Visible = true;
                InforList.DataSource = ds;
                InforList.DataBind();
                L_InforListStr.Text = string.Empty;
            }
            else
            {
                InforList.Visible = false;
                L_InforListStr.Text = "暂无满足条件的网址!";
            }
        }
        catch
        {
            InforList.Visible = false;
            L_InforListStr.Text = "出错了!!!!!!!!";
        }
    }

    #endregion

    #region Controls change

    protected void AspNetPager1_PageChanging(object src, Wuqi.Webdiyer.PageChangingEventArgs e)
    {
        try
        {
            AspNetPager1.CurrentPageIndex = e.NewPageIndex;
            bindInfolist();
        }
        catch
        { }
    }

    protected void DDLDirectory_Changed(object sender, EventArgs e)
    {
        try
        {
            BindDDLExam();
            AspNetPager1.CurrentPageIndex = 0;
            Bindurllist();
            bindInfolist();
        }
        catch
        { }
    }

    protected void DDLExam_Changed(object sender, EventArgs e)
    {
        try
        {
            AspNetPager1.CurrentPageIndex = 0;
            Bindurllist();
            bindInfolist();
        }
        catch
        { }
    }

    protected void DDLProvince_Changed(object sender, EventArgs e)
    {
        try
        {
            AspNetPager1.CurrentPageIndex = 0;
            Bindurllist();
            bindInfolist();
        }
        catch
        { }
    }
    #endregion

    #region Event
    /// <summary>
    /// 绑定数据
    /// </summary>
    /// <param name="Sender"></param>
    /// <param name="e"></param>
    protected void RelationList_DataBound(Object Sender, RepeaterItemEventArgs e)
    {
        if (e.Item.ItemType == ListItemType.Item || e.Item.ItemType == ListItemType.AlternatingItem)
        {
            LinkButton likBtn = (LinkButton)e.Item.FindControl("LinkButton1");
            if (KaoShi110.Forum.ForumUtils.GetCookie("roles").IndexOf("SuperAdmin") != -1 || KaoShi110.Forum.ForumUtils.GetCookie("roles").IndexOf("NewsDelete") != -1)
            {
                likBtn.Visible = true;
            }
            else
            {
                likBtn.Visible = false;
            }
        }
    }

    protected void RelationList_ItemCommand(object sender, RepeaterCommandEventArgs e)
    {
        if (e.CommandName == "del")   //删除整条新闻
        {
            LinkButton likBtn = (LinkButton)e.Item.FindControl("LinkButton1");
            try
            {
                originlinksLogic.Delete(int.Parse(likBtn.ToolTip));
                bindInfolist();
                DNTRequest.RaiseTip(this.Page, "删除成功!");
            }
            catch
            {
                DNTRequest.RaiseTip(this.Page, "删除失败!!!!!!!!");
            }
        }
    }

    protected void BtnSearch_Click(object sender, EventArgs e)
    {
        try
        {
            InforContent.Text = string.Empty;
            AspNetPager1.CurrentPageIndex = 0;
            bindInfolist();
        }
        catch
        { }
    }

    protected void BtnCaiji_Click(object sender, EventArgs e)
    {
        try
        {
            StringBuilder linkSb = new StringBuilder();
            InforContent.Text = string.Empty;
            Dictionary<int[], string> WhereFilter = new Dictionary<int[], string>();
            if (DirectoryID > 0)
            {
                originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.DirectoryID, OperationEnum.Equal, DirectoryID);
            }
            if (ExamID > 0)
            {
                originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.ExamID, OperationEnum.Equal, ExamID);
            }
            if (!DDLProvince.SelectedItem.Value.Equals("0"))
            {
                originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.DiquID, OperationEnum.Equal, DDLProvince.SelectedItem.Value);
            }
            if (!ddl_urllist.SelectedItem.Value.Equals("0"))
            {
                originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.ID, OperationEnum.Equal, ddl_urllist.SelectedItem.Value);
            }
            List<int> OrderFilter = new List<int>();
            originlinksLogic.EnumToOrderByEnum(OrderFilter, originlinksInfo.Fields.ID, SortingAction.Desc);
            DataSet ds = originlinksLogic.GetByQuery(WhereFilter, OrderFilter);
            foreach (DataRow sjurlDR in ds.Tables[0].Rows)
            {
                List<string> Weburllist = new List<string>();
                List<string> Weburllistzx = new List<string>();
                StringBuilder weburlSB = new StringBuilder();
                bool IsGenxin = false;
                //最新的内容   数据库
                MatchCollection mcexplain = Regex.Matches(sjurlDR["LinkContent"].ToString(), @"(?<=\$-\$).*?(?=\$_\$)", RegexOptions.Singleline);
                foreach (Match m in mcexplain)
                {
                    Weburllist.Add(m.Value);
                }
                //网址方面
                System.Net.WebRequest newswebrequest = System.Net.WebRequest.Create(sjurlDR["LinkUrl"].ToString());
                System.Net.WebResponse newswebresponse = newswebrequest.GetResponse();
                System.IO.Stream newsstream = newswebresponse.GetResponseStream();
                System.IO.StreamReader sr = new StreamReader(newsstream, System.Text.Encoding.UTF8);
                string ProductionContent = string.Empty;
                //
                ProductionContent = sr.ReadToEnd();


                sr.Close();
                Regex reg = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+/?");
                string wangzhanyuming = reg.Match(sjurlDR["LinkUrl"].ToString(), 0).Value;
                MatchCollection mc = Regex.Matches(ProductionContent.Replace("href=\"/", "href=\"" + wangzhanyuming).Replace("href='/", "href='" + wangzhanyuming).Replace("href=/", "href=" + wangzhanyuming).Replace("href=\"./", "href=\"" + wangzhanyuming), @"<[aA][^>]* href=[^>]*>", RegexOptions.Singleline);
                int Index = 1;
                foreach (Match m in mc)
                {
                    MatchCollection mc1 = Regex.Matches(m.Value, @"[a-zA-z]+://[^\s]*", RegexOptions.Singleline);
                    if (mc1.Count > 0)
                    {
                        foreach (Match m1 in mc1)
                        {
                            string linkurlstr = string.Empty;
                            linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
                            weburlSB.Append("$-$");
                            weburlSB.Append(linkurlstr);
                            weburlSB.Append("$_$");
                            if (!Weburllist.Contains(linkurlstr) && !Weburllistzx.Contains(linkurlstr))
                            {
                                IsGenxin = true;
                                Weburllistzx.Add(linkurlstr);
                                linkSb.AppendFormat("{0}<br/>", linkurlstr);
                                //Dictionary<int[], string> WhereFilter1 = new Dictionary<int[], string>();
                                //originnewsLogic.EnumToDictionary(WhereFilter1, originnewsInfo.Fields.linkUrl, OperationEnum.Equal, linkurlstr);
                                //if (!originnewsLogic.IsExists(WhereFilter1))
                                //{
                                //    originnewsInfo oninfo = new originnewsInfo();
                                //    oninfo.CreateDate = DateTime.Now;
                                //    oninfo.linkUrl = linkurlstr;
                                //    int onint = originnewsLogic.Insert(oninfo);
                                //    linkSb.AppendFormat("{0}<br/>", linkurlstr);
                                //}
                            }
                        }
                    }
                    else
                    {
                        if (m.Value.IndexOf("javascript") == -1)
                        {
                            string amstr = string.Empty;
                            string wangzhanxiangduilujin = string.Empty;
                            wangzhanxiangduilujin = sjurlDR["LinkUrl"].ToString().Substring(0, sjurlDR["LinkUrl"].ToString().LastIndexOf("/") + 1);
                            amstr = m.Value.Replace("href=\"", "href=\"" + wangzhanxiangduilujin).Replace("href='", "href='" + wangzhanxiangduilujin);
                            MatchCollection mc11 = Regex.Matches(amstr, @"[a-zA-z]+://[^\s]*", RegexOptions.Singleline);

                            foreach (Match m1 in mc11)
                            {
                                string linkurlstr = string.Empty;
                                linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
                                weburlSB.Append("$-$");
                                weburlSB.Append(linkurlstr);
                                weburlSB.Append("$_$");
                                if (!Weburllist.Contains(linkurlstr) && !Weburllistzx.Contains(linkurlstr))
                                {
                                    IsGenxin = true;
                                    Weburllistzx.Add(linkurlstr);
                                    linkSb.AppendFormat("{0}<br/>", linkurlstr);
                                    //Dictionary<int[], string> WhereFilter1 = new Dictionary<int[], string>();
                                    //originnewsLogic.EnumToDictionary(WhereFilter1, originnewsInfo.Fields.linkUrl, OperationEnum.Equal, linkurlstr);
                                    //if (!originnewsLogic.IsExists(WhereFilter1))
                                    //{
                                    //    originnewsInfo oninfo = new originnewsInfo();
                                    //    oninfo.CreateDate = DateTime.Now;
                                    //    oninfo.linkUrl = linkurlstr;
                                    //    int onint = originnewsLogic.Insert(oninfo);
                                    //    linkSb.AppendFormat("{0}<br/>", linkurlstr);
                                    //}
                                }
                            }
                        }
                    }
                    Index++;
                }
                System.Threading.Thread.Sleep(1000);
                if (IsGenxin)
                {
                    originlinksInfo oinfo = new originlinksInfo();
                    oinfo = originlinksLogic.Get(int.Parse(sjurlDR["ID"].ToString()));
                    oinfo.LinkContentnext = oinfo.LinkContent;
                    oinfo.LinkContent = weburlSB.ToString();
                    originlinksLogic.Update(oinfo);
                    System.Threading.Thread.Sleep(2000);
                }
            }
            InforContent.Text = linkSb.ToString();
            DNTRequest.RaiseTip(this.Page, "采集成功!");
        }
        catch (Exception ex)
        {
            DNTRequest.RaiseTip(this.Page, "出错了,采集失败!!!!!!!!!");
        }
    }

    //protected void BtnSCaiji_Click(object sender, EventArgs e)
    //{
    //    try
    //    {
    //        StringBuilder linkSb = new StringBuilder();
    //        InforContent.Text = string.Empty;
    //        Dictionary<int[], string> WhereFilter = new Dictionary<int[], string>();
    //        if (DirectoryID > 0)
    //        {
    //            originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.DirectoryID, OperationEnum.Equal, DirectoryID);
    //        }
    //        if (ExamID > 0)
    //        {
    //            originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.ExamID, OperationEnum.Equal, ExamID);
    //        }
    //        if (!DDLProvince.SelectedItem.Value.Equals("0"))
    //        {
    //            originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.DiquID, OperationEnum.Equal, DDLProvince.SelectedItem.Value);
    //        }
    //        if (!ddl_urllist.SelectedItem.Value.Equals("0"))
    //        {
    //            originlinksLogic.EnumToDictionary(WhereFilter, originlinksInfo.Fields.ID, OperationEnum.Equal, ddl_urllist.SelectedItem.Value);
    //        }
    //        List<int> OrderFilter = new List<int>();
    //        originlinksLogic.EnumToOrderByEnum(OrderFilter, originlinksInfo.Fields.ID, SortingAction.Desc);
    //        DataSet ds = originlinksLogic.GetByQuery(WhereFilter, OrderFilter);
    //        foreach (DataRow sjurlDR in ds.Tables[0].Rows)
    //        {
    //            List<string> Weburllist = new List<string>();
    //            List<string> Weburllistzx = new List<string>();
    //            StringBuilder weburlSB = new StringBuilder();
    //            MatchCollection mcexplainzx = Regex.Matches(sjurlDR["LinkContent"].ToString(), @"(?<=\$-\$).*?(?=\$_\$)", RegexOptions.Singleline);
    //            MatchCollection mcexplain = Regex.Matches(sjurlDR["LinkContentnext"].ToString(), @"(?<=\$-\$).*?(?=\$_\$)", RegexOptions.Singleline);
    //            foreach (Match m in mcexplain)
    //            {
    //                Weburllist.Add(m.Value);
    //            }
    //            foreach (Match m in mcexplainzx)
    //            {
    //                if (!Weburllist.Contains(m.Value) && !Weburllistzx.Contains(m.Value))
    //                {
    //                    Weburllistzx.Add(m.Value);
    //                    linkSb.AppendFormat("{0}<br/>", m.Value);
    //                }
    //            }
    //        }
    //        InforContent.Text = linkSb.ToString();
    //    }
    //    catch
    //    { }
    //}

    protected void BtnSearchcaiji_Click(object sender, EventArgs e)
    {
        try
        {
            InforContent.Text = string.Empty;
            StringBuilder linkSb = new StringBuilder();
            Dictionary<int[], string> WhereFilter = new Dictionary<int[], string>();
            originnewsLogic.EnumToDictionary(WhereFilter, originnewsInfo.Fields.CreateDate, OperationEnum.MorethanEqual, StartDate.Text);
            originnewsLogic.EnumToDictionary(WhereFilter, originnewsInfo.Fields.CreateDate, OperationEnum.LessThanEqual, DateTime.Parse(EndDate.Text).AddDays(1));
            if (!string.IsNullOrEmpty(tb_link.Text.Trim()))
            {
                originnewsLogic.EnumToDictionary(WhereFilter, originnewsInfo.Fields.linkUrl, OperationEnum.Like, tb_link.Text.Trim());
            }
            List<int> OrderFilter = new List<int>();
            originnewsLogic.EnumToOrderByEnum(OrderFilter, originnewsInfo.Fields.ID, SortingAction.Desc);
            DataSet ds = originnewsLogic.GetByQuery(WhereFilter, OrderFilter);
            foreach (DataRow sjurlDR in ds.Tables[0].Rows)
            {
                linkSb.AppendFormat("{0}<br/>", sjurlDR["linkUrl"]);
            }
            InforContent.Text = linkSb.ToString();
        }
        catch
        { }
    }
    #endregion

}

 

支持(0) 反对(0) 快舔包我很肥 | 园豆:69 (初学一级) | 2016-05-12 10:22

主要的方法BtnCaiji_Click 这里面

支持(0) 反对(0) 快舔包我很肥 | 园豆:69 (初学一级) | 2016-05-12 10:23

@魇: 要直接对比两个请求,你发代码也没用。。。

支持(1) 反对(0) 幻天芒 | 园豆:36594 (高人七级) | 2016-05-12 16:48
0

你直接改cookie怕是不算登录吧?

收获园豆:5
刘宏玺 | 园豆:14004 (专家六级) | 2016-05-12 09:41

wo也不晓得,菜鸟求大神指点!

支持(0) 反对(0) 快舔包我很肥 | 园豆:69 (初学一级) | 2016-05-12 10:25
0

写爬虫的童鞋可以试试神箭手云爬虫,自带JS渲染、代理ip、验证码识别等功能,还可以发布和导出爬取的数据,生成图表等,都在云端进行,不需要安装开发环境。

开发文档参考:http://doc.shenjianshou.cn/

sosozzzx | 园豆:302 (菜鸟二级) | 2016-08-18 16:53
清除回答草稿
   您需要登录以后才能回答,未注册用户请先注册