接到个工作任务需求是这样的,做一个简单的爬虫自动采集网站信息,但是某个网站会有IP时间限制,所以短时间内采集了10页后,就得切换一个IP,单线程采集就非常慢,但是我发现这样的多线程加了lock后相当于单线程,非常非常的慢,我本来想把lock去掉,速度是变快了,但是有个问题,自动拨号换IP差不多也要15秒左右时间,但是有些线程正在网络请求中,如果子线程请求的时间都低于15秒,那切换网络IP的时候还比较正常,但是比如子线程3的一次网络请求,如果这个网页比较复杂,可能要下载30秒,但是后面的子线程如果不加LOCK的话,会接着自动往前跑,这样,那个切换IP的方法下载的其实就是有问题的,会导致下载的过程当中,下载动作的IP是乱的。我最终目的就是想 当子线程设定为5的时候,就把5个线程全部跑完了,其它的就等待在那儿,直到IP切换完成,才接着跑后面的5个个线程,周而复始这样。不知道我有没有表达清楚我的意思。
下面是我写的代码,请各位专家帮我把把脉,谢谢各位。
程序启动代码:
class Program
{
static void Main(string[] args)
{
DownThreadPool down = new DownThreadPool();
DownloadPageItem item;
for (int i = 0; i < 100; i++)
{
item = new DownloadPageItem();
item.URL = "http://" + i.ToString()+".com";
down.addTask(item);
}
down.Start(5, 6);
Console.Read();
}
}
下载的多线程代码
public class DownThreadPool
{
/// <summary>
/// 网址列表
/// </summary>
private DownloadPageList list = new DownloadPageList();
/// <summary>
/// 最大线程数
/// </summary>
private int maxThreadNum = 0;
/// <summary>
/// 步长数
/// </summary>
private int maxStepNum = 0;
/// <summary>
/// 当前执行的步长
/// </summary>
private int executeStepNum = 0;
/// <summary>
/// 线程数组
/// </summary>
private Thread[] threads = null;
/// <summary>
/// 随机数
/// </summary>
private static Random rnd = new Random();
/// <summary>
/// 锁对象
/// </summary>
private object o = new object();
/// <summary>
/// 添加帖子
/// </summary>
/// <param name="item"></param>
public void addTask(DownloadPageItem item)
{
lock (o)
{
list.Add(item);
}
}
/// <summary>
/// 修改IP
/// </summary>
private void changeIP()
{
Console.WriteLine("自动拨号获取新IP,延时10秒");
Thread.Sleep(10 * 1000);
}
/// <summary>
/// 线程执行的内容
/// </summary>
private void threadExecute(object info)
{
while (true)
{
DownloadPageItem post = null;
lock (o)
{
if (executeStepNum % maxStepNum == 0)
{
executeStepNum = 0;
this.changeIP();
}
int maxCount = list.Count - 1;
if (maxCount >= 0)
{
post = list[maxCount];
//从列表中移除该对象
list.RemoveAt(maxCount);
}
//步长加1
executeStepNum++;
if (post != null)
{
int m = rnd.Next(10, 20);
Thread.Sleep(m * 1000);
Console.WriteLine(string.Format("{0} 正在发表帖子 {1} 网络请求{2}秒 ", Thread.CurrentThread.Name, post.URL, m));
}
maxCount = list.Count - 1;
if (maxCount < 0)
{
Console.WriteLine(string.Format("{0} 结束", Thread.CurrentThread.Name));
break;
}
}
Thread.Sleep(20);
}
}
/// <summary>
/// 启动线程
/// </summary>
/// <param name="threadNum">线程数</param>
/// <param name="stepNum">每隔多少步数就切换一次IP</param>
public void Start(int threadNum,int stepNum)
{
if (threadNum <= 0)
{
throw new Exception("线程数必须大于0");
}
if (stepNum <= 0)
{
throw new Exception("步长数必须大于0");
}
if (list.Count <= 0)
{
throw new Exception("任务数必须大于1项");
}
//如果线程数大过总的任务数,则将线程数设为总的任务数
if (threadNum > list.Count)
{
threadNum = list.Count;
}
//如果步长数大过总的任务数,则将步长数设为总的任务数
if (stepNum > list.Count)
{
stepNum = list.Count;
}
this.maxThreadNum = threadNum;
this.maxStepNum = stepNum;
//初始化线程数
threads = new Thread[this.maxThreadNum];
for (int i = 0; i < threads.Length; i++)
{
threads[i] = new Thread(new ParameterizedThreadStart(this.threadExecute));
threads[i].IsBackground = false;
threads[i].Name = string.Format("线程{0}", i);
threads[i].Start(i);
}
}
}
/// <summary>
/// 帖子
/// </summary>
public class DownloadPageItem
{
/// <summary>
/// 帖子标题
/// </summary>
public string URL { get; set; }
}
/// <summary>
/// 帖子集合
/// </summary>
public class DownloadPageList:List<DownloadPageItem>
{
}
while true改为while(xxManualResetEvent.WaitOne())
在你换ip的时候你把这个ManualResetEvent的实例给reset下,换完再set下就可以了
主线程分发ip,子线程执行任务