/*
 * NOTE(review): this copy of the file appears damaged and should be restored from version
 * control before any behaviour is changed.
 *   - Statements that belong on separate lines share a few very long physical lines, so each
 *     line comment swallows the code that follows it on the same physical line.
 *   - The collections are raw types although changeProxy assigns ipList.get(...) to a String
 *     and pullAndWrite iterates threadList as Thread; generic type arguments look lost.
 *   - Several regex literals end in a reluctant group with nothing after it, or are split
 *     across physical lines; their closing delimiters look lost.
 *   - The bookImgRegex literal runs straight into the thread list assignment, so the code that
 *     declares rootElement, tag, m, findCount and nowPageNum is not present in this copy.
 * The comments added below only describe what is still visible.
 */
import org.apache.http.HttpHeaders; import org.apache.http.HttpHost; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.dom4j.Document; import org.dom4j.DocumentHelper; import org.dom4j.Element; import java.io.IOException; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** Crawler driver. Holds the shared HttpClient and a hard-coded proxy table, can download the tag page (getTagList), and in pullAndWrite starts one GetBookInfoThread per listing match and a WriteBookInfoToFile once a category is finished. Both of those types are declared outside this file. */ public class Main { CloseableHttpClient httpClient; /* NOTE(review): bookId is not read anywhere in this file; presumably other classes use it - verify. */ static int bookId = 496; Map proxyMap;//ip -> port List ipList;//an ip is read from this list, then its port is looked up in the map by that ip int i = 0;//index used to take the next ip from the list and switch to the corresponding proxy /** Program entry point: creates a Main, builds a list of three hard-coded tags and calls pullAndWrite with a limit of 10 pages per tag. */ public static void main(String[] args) { Main m = new Main(); // List tagList = m.getTagList(); List tagList = new LinkedList(); // tagList.add("经典"); // tagList.add("日本文学"); // tagList.add("散文"); // tagList.add("中国文学"); // tagList.add("算法"); // tagList.add("童话"); // tagList.add("外国文学"); // tagList.add("文学"); // tagList.add("小说"); // tagList.add("漫画"); // tagList.add("诗词"); // tagList.add("心理学"); tagList.add("摄影"); tagList.add("理财"); tagList.add("经济学"); m.pullAndWrite(tagList,10); } /** Creates the default HttpClient (no proxy configured) and fills the proxy table through setProxyMap. */ public Main() { // HttpHost proxy = new HttpHost("122.225.106.35",80); // httpClient = HttpClients.custom().setProxy(proxy).build(); httpClient = HttpClients.createDefault(); setProxyMap(); } /** Replaces proxyMap and ipList with fresh collections and registers the single hard-coded proxy 211.68.122.171 on port 80 in both. */ public void setProxyMap() { proxyMap = new HashMap(); ipList = new LinkedList(); proxyMap.put("211.68.122.171",80);ipList.add("211.68.122.171"); } /** Downloads the tag page with a browser-like User-Agent header and returns group 1 of every match of the tag regex, in match order. NOTE(review): the response is closed inside the try body and again in finally; when execute throws, tagPageResponse is still null, so the close call in finally throws NullPointerException, and tagPageCode is still null when it is handed to p.matcher. NOTE(review): the tag regex ends in a reluctant group with nothing after it, so as written group 1 is always the empty string; the closing delimiter looks lost - confirm against the original file. */ public List getTagList() { HttpGet getTag = new HttpGet("http://book.douban.com/tag/"); getTag.addHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30"); CloseableHttpResponse tagPageResponse = null; String tagPageCode = null;//page source try { tagPageResponse = httpClient.execute(getTag); tagPageCode = EntityUtils.toString(tagPageResponse.getEntity()); tagPageResponse.close(); } catch (IOException 
e) { e.printStackTrace(); } finally { try { tagPageResponse.close(); } catch (IOException e) { e.printStackTrace(); } } Pattern p = Pattern.compile("class=\"tag\">(.*?)"); Matcher m = p.matcher(tagPageCode); List resultTagList = new LinkedList(); while (m.find()) { resultTagList.add(m.group(1)); } return resultTagList; } /** * Visible behaviour: for every match of the book address regex a GetBookInfoThread is created with the shared client, the two captured groups, rootElement and the four detail regexes. A page without any match ends the page loop. Otherwise all threads are started, then joined, and the page counter is incremented. After the page loop a WriteBookInfoToFile is started with rootElement and an XML path built from the tag. * NOTE(review): the code between the bookImgRegex literal and the thread list assignment is missing from this copy, so the tag loop, the page download and the use of maxPageNum cannot be described here. * NOTE(review): an InterruptedException raised by join is only printed; the interrupt flag is not restored. * * @param tagList the book categories to crawl * @param maxPageNum the maximum number of pages to fetch for each category */ public void pullAndWrite(List tagList,int maxPageNum) { Pattern bookAddressRegex = Pattern.compile("href=\"(.*?)\" class=\"title\" target=\"_blank\">(.*?)"); //regex that extracts the address of each individual book Pattern bookAuthorRegex = Pattern.compile("(?s) 作者:.*?>(.*?)");//matches the author Pattern bookPublishRegex = Pattern.compile("出版社: (.*?)
"); Pattern bookIsbnRegex = Pattern.compile("ISBN: (.*?)
"); Pattern bookImgRegex = Pattern.compile(" threadList = new LinkedList(); while (m.find()) { threadList.add(new GetBookInfoThread(httpClient, m.group(1), m.group(2), rootElement, bookAuthorRegex, bookPublishRegex, bookIsbnRegex,bookImgRegex)); findCount++; } //no match means every book of this category has already been seen, so stop searching this category if (findCount == 0) { break; } for (Thread thread:threadList) { thread.start(); } for (Thread thread:threadList) { try { thread.join(); } catch (InterruptedException e) { e.printStackTrace(); } } nowPageNum++; } //write only after a whole category has been crawled new WriteBookInfoToFile(rootElement,"/home/geekgao/book/" + tag + ".xml").start(); //start a separate thread that writes the file } } /** Switches httpClient to a new client that uses the next proxy from ipList (port looked up in proxyMap) and prints the chosen proxy. When the index has reached the end of ipList it prints a message and exits the JVM with status 0 instead. NOTE(review): the previous httpClient is replaced without being closed. */ private void changeProxy() { if (i >= ipList.size()) { System.out.println("代理用完了,退出"); System.exit(0); } String ip = ipList.get(i++); httpClient = HttpClients.custom().setProxy(new HttpHost(ip,proxyMap.get(ip))).build(); System.out.println("换代理啦,使用代理:" + ip + ",端口:" + proxyMap.get(ip)); } }