1 package cn.xls.util; 2 3 import cn.xls.pojo.City; 4 import org.jsoup.Jsoup; 5 import org.jsoup.nodes.Document; 6 import org.jsoup.nodes.Element; 7 import org.jsoup.select.Elements; 8 9 import java.io.IOException; 10 import java.util.ArrayList; 11 import java.util.List; 12 13 /** 14 * @program: 15 * @description: 获取城市信息 16 * @author: l 17 */ 18 public class CityInfoUtil { 19 20 public static List<City> getProvinceData(String requestUrl) { 21 List<City> lists = new ArrayList<City>(); 22 try { 23 Document document = Jsoup.connect(requestUrl).timeout(50000).maxBodySize(0).get(); 24 //获取所有的省份 25 Elements provinceList = document.select("tr[class='provincetr']").select("td").select("a"); 26 //遍历省份 27 for (Element element : provinceList) { 28 //拼接当前省份下的城市地址 29 String url1 = requestUrl.replace("index.html", element.attr("href")); 30 Document document1 = Jsoup.connect(url1).timeout(50000).maxBodySize(0).get(); 31 32 //获取该省份下所有城市(第一列为城市区号,第二列为城市名) 33 Elements citys = document1.select("tr[class='citytr']").select("td:eq(1)").select("a"); 34 String province = element.html(); 35 System.out.println("当前省份 : " + province); 36 System.out.println("当前省份下有 : " + citys.size() + " 个城市"); 37 //遍历城市 38 for (Element element1 : citys) { 39 // System.out.println("城市 : " + element1.html()); 40 //拼接当前城市下的区域地址 41 String url2 = requestUrl.replace("index.html", element1.attr("href")); 42 Document document2 = Jsoup.connect(url2).timeout(50000).maxBodySize(0).get(); 43 44 //获取该城市下所有区域(第一列为区域区号,第二列为区域名) 45 Elements areas = document2.select("tr[class=countytr] td:eq(1)").select("a"); 46 String city = element1.html(); 47 System.out.println("当前城市 : " + city); 48 System.out.println("当前城市下有 : " + areas.size() + " 个区"); 49 for (Element element2 : areas) { 50 System.out.println("区域 : " + element2.html()); 51 } 52 } 53 System.out.println("-----------"); 54 } 55 } catch (IOException e) { 56 e.printStackTrace(); 57 } 58 return lists; 59 } 60 61 //测试 62 public static void main(String[] args) { 63 List<City> lists = getProvinceData("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html"); 64 } 65 }
区域 : 呼和浩特金海工业园区 区域 : 呼和浩特经济技术开发区 城市 : 包头市 java.net.SocketTimeoutException: Read timed out at java.net.SocketInputStream.socketRead0(Native Method) at java.net.SocketInputStream.socketRead(SocketInputStream.java:116) at java.net.SocketInputStream.read(SocketInputStream.java:170) at java.net.SocketInputStream.read(SocketInputStream.java:141) at java.io.BufferedInputStream.fill(BufferedInputStream.java:246) at java.io.BufferedInputStream.read1(BufferedInputStream.java:286) at java.io.BufferedInputStream.read(BufferedInputStream.java:345) at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:704) at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:647) at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1535) at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1440) at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:480) at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:750) at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:722) at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:306) at org.jsoup.helper.HttpConnection.get(HttpConnection.java:295) at cn.xls.util.CityInfoUtil.getProvinceData(CityInfoUtil.java:49) at cn.xls.util.CityInfoUtil.main(CityInfoUtil.java:78)
每次打印了四分之一左右的数据后就会超时,请问该怎么解决啊?
正在学习怎么用jsoup爬取数据,请多指教
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html
...解决了,网站响应太慢了,我把超时时间又加了十倍,慢慢的全部打印完了...