package com.reptilian;

import com.reptilian.TestReptilianKeyWord;
import com.type.RedioType;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicLong;

/**
 * Crawler demo: walks listing pages of pic.netbian.com, optionally filters the entries by a
 * keyword found in the thumbnail's {@code alt} text, and queues the image URLs found on each
 * detail page for download by background worker threads.
 *
 * <p>The download queue and its workers are shared by all instances (they are static).
 * Instances themselves are not thread-safe: {@code count} is a plain field.
 *
 * @author li (created 2018-05-19 19:52)
 */
public class TestReptilianKeyWord {

    /** Scheme and host that relative links scraped from the pages are resolved against. */
    private static final String SITE_ROOT = "http://pic.netbian.com";

    /** Directory prefix that downloaded images are written under. */
    private static final String DOWNLOAD_DIR = "E:\\imgs/";

    /** Number of background download workers. */
    private static final int WORKER_COUNT = 10;

    /** Image URLs waiting to be downloaded; filled by the crawler, drained by the workers. */
    private static final BlockingQueue<String> queue = new LinkedBlockingQueue<String>();

    /** Makes file names unique even when several workers save a file in the same millisecond. */
    private static final AtomicLong fileSequence = new AtomicLong();

    /** Guarded by the class lock in {@link #startWorkers()}. */
    private static boolean workersStarted = false;

    /** Number of detail pages visited during the current run. */
    private int count = 0;

    /**
     * Creates a crawler and makes sure the shared download workers are running.
     */
    public TestReptilianKeyWord() {
        startWorkers();
    }

    /**
     * Starts the download workers the first time it is called; later calls do nothing.
     *
     * <p>The workers drain a static queue, so starting a fresh set for every instance (as the
     * constructor used to) only leaked threads that block on the queue forever.
     */
    private static synchronized void startWorkers() {
        if (workersStarted) {
            return;
        }
        ExecutorService service = Executors.newCachedThreadPool();
        for (int i = 0; i < WORKER_COUNT; i++) {
            service.execute(new ThreagDownload(queue));
        }
        // shutdown() only rejects further submissions; the workers already submitted keep running.
        service.shutdown();
        workersStarted = true;
    }

    /**
     * Crawls the "4kmeinv" listing pages {@code index_2.html} through {@code index_168.html}.
     *
     * <p>NOTE(review): numbering starts at 2, so a first listing page (presumably served under
     * a different URL) is not visited - confirm whether that is intended.
     *
     * @param keyWord keyword an entry's {@code alt} text must contain; {@code null}, {@code ""}
     *                or {@code "null"} disables filtering
     * @return two messages: the number of detail pages visited and the elapsed seconds
     */
    public String[] runBeautiful(String keyWord) {
        long time1 = System.currentTimeMillis();
        count = 0; // report this run only, matching the per-run elapsed time
        for (int i = 2; i <= 168; i++) {
            String url = "http://pic.netbian.com/4kmeinv/index_" + i + ".html";
            start(url, keyWord);
        }
        return returnStr(time1, keyWord);
    }

    /**
     * Crawls pages 0 through 27 of the search result with {@code searchid=377}.
     *
     * @param keyWord keyword an entry's {@code alt} text must contain; {@code null}, {@code ""}
     *                or {@code "null"} disables filtering
     * @return two messages: the number of detail pages visited and the elapsed seconds
     */
    public String[] runGame(String keyWord) {
        long time1 = System.currentTimeMillis();
        count = 0; // report this run only, matching the per-run elapsed time
        for (int i = 0; i <= 27; i++) {
            String url = "http://pic.netbian.com/e/search/result/index.php?page=" + i + "&searchid=377";
            start(url, keyWord);
        }
        return returnStr(time1, keyWord);
    }

    /**
     * Builds the summary messages for a run.
     *
     * @param time1   start time of the run in milliseconds, as from {@code System.currentTimeMillis()}
     * @param keyWord keyword that was used for the run; only echoed in the message
     * @return a two-element array: the count message and the elapsed-time message (whole seconds)
     */
    public String[] returnStr(long time1, String keyWord) {
        long time2 = System.currentTimeMillis();
        String str1 = "一共抓取" + count + "张" + keyWord + "图片。";
        String str2 = "共用时" + (time2 - time1) / 1000 + "秒。";
        return new String[]{str1, str2};
    }

    /**
     * Processes one listing page: fetches it, collects the matching detail-page links, then
     * fetches every detail page and queues the images found on it.
     *
     * @param url     address of the listing page
     * @param keyWord keyword filter, see {@link #getNextUrl(String, String)}
     */
    public void start(String url, String keyWord) {
        ArrayList<String> list = getNextUrl(getHtmlByUrl(url), keyWord);
        for (String path : list) {
            count++;
            // Fetch the detail page's source, then queue its images for download.
            String code = getHtmlByUrl(path);
            getImgFromCode(code);
        }
    }

    /**
     * Fetches the source of a web page, decoding the response as GBK.
     *
     * @param url address of the page
     * @return the page source with each line terminated by {@code '\n'}; on an I/O error the
     *         stack trace is printed and whatever was read so far (possibly nothing) is returned
     */
    public static String getHtmlByUrl(String url) {
        StringBuilder builder = new StringBuilder();
        try {
            URLConnection connection = new URL(url).openConnection();
            connection.connect();
            try (BufferedReader bufferedReader =
                         new BufferedReader(new InputStreamReader(connection.getInputStream(), "gbk"))) {
                String line;
                while ((line = bufferedReader.readLine()) != null) {
                    builder.append(line).append('\n');
                }
            }
        } catch (IOException e) {
            // Also covers MalformedURLException, which is an IOException.
            e.printStackTrace();
        }
        return builder.toString();
    }

    /**
     * Finds every {@code img} inside {@code div[class=photo-pic]} in the given page source and
     * puts its absolute URL on the download queue.
     *
     * @param code HTML source of a detail page
     */
    public static void getImgFromCode(String code) {
        Document document = Jsoup.parse(code);
        Elements elements = document.select("div[class=photo-pic]").select("img");
        for (int i = 0, k = elements.size(); i < k; i++) {
            String src = elements.get(i).attr("src");
            if (src.isEmpty()) {
                // An img without a src cannot be downloaded.
                continue;
            }
            queue.offer(toAbsoluteUrl(src));
        }
    }

    /**
     * Extracts the detail-page links from a listing page.
     *
     * <p>The {@code img} and {@code a} elements under {@code ul[class=clearfix]} are matched up
     * by position: the i-th image's {@code alt} text decides whether the i-th anchor's
     * {@code href} is kept.
     *
     * @param code    HTML source of a listing page
     * @param keyWord keyword the {@code alt} text must contain; when {@link #isEmpy(String)}
     *                reports it as blank, every entry is kept
     * @return absolute URLs of the matching detail pages, in document order
     */
    public static ArrayList<String> getNextUrl(String code, String keyWord) {
        boolean filterByKeyWord = isEmpy(keyWord);
        Document document = Jsoup.parse(code);
        Elements images = document.select("ul[class=clearfix]").select("img");
        Elements anchors = document.select("ul[class=clearfix]").select("a");
        ArrayList<String> list = new ArrayList<String>();
        // The two selections are indexed in parallel; never read past the shorter one.
        int pairs = Math.min(images.size(), anchors.size());
        for (int i = 0; i < pairs; i++) {
            if (filterByKeyWord && !images.get(i).attr("alt").contains(keyWord)) {
                continue;
            }
            list.add(toAbsoluteUrl(anchors.get(i).attr("href")));
        }
        return list;
    }

    /**
     * Resolves a link scraped from a page against {@link #SITE_ROOT}, inserting exactly one
     * {@code '/'} between host and path; links that already carry a scheme are returned as is.
     */
    private static String toAbsoluteUrl(String link) {
        if (link.startsWith("http://") || link.startsWith("https://")) {
            return link;
        }
        if (link.startsWith("//")) {
            return "http:" + link;
        }
        return link.startsWith("/") ? SITE_ROOT + link : SITE_ROOT + "/" + link;
    }

    /**
     * Downloads an image into the download directory, creating the directory when necessary.
     *
     * <p>The file is named from the current time plus a sequence number, followed by the
     * extension of the URL's last path segment (if it has one). I/O errors are printed and
     * otherwise ignored.
     *
     * @param str absolute URL of the image
     */
    public static void download(String str) {
        String imgName = System.currentTimeMillis() + "_" + fileSequence.incrementAndGet() + extensionOf(str);
        File target = new File(DOWNLOAD_DIR + imgName);
        File parent = target.getParentFile();
        if (parent != null) {
            // Result ignored on purpose: if the directory cannot be created, opening the file
            // below fails and is reported there.
            parent.mkdirs();
        }
        try (InputStream in = new URL(str).openStream();
             FileOutputStream out = new FileOutputStream(target)) {
            byte[] buffer = new byte[2048];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
        } catch (IOException e) {
            // Also covers MalformedURLException, which is an IOException.
            e.printStackTrace();
        }
    }

    /**
     * Returns the extension (including the dot) of the last path segment of a URL, ignoring any
     * query string or fragment, or {@code ""} when that segment has no dot.
     */
    private static String extensionOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        int dot = path.lastIndexOf('.');
        // A dot before the last '/' belongs to the host or a directory, not to the file name.
        if (dot < 0 || dot < path.lastIndexOf('/')) {
            return "";
        }
        return path.substring(dot);
    }

    /**
     * Worker that keeps taking image URLs from the queue and downloading them.
     */
    private static class ThreagDownload implements Runnable {

        /** Queue to take URLs from; {@code take()} blocks while it is empty. */
        private final BlockingQueue<String> messageQueue;

        ThreagDownload(BlockingQueue<String> messageQueue) {
            this.messageQueue = messageQueue;
        }

        @Override
        public void run() {
            while (!Thread.currentThread().isInterrupted()) {
                try {
                    schedule(messageQueue.take());
                } catch (InterruptedException e) {
                    // Restore the flag and stop instead of spinning on an interrupted thread.
                    Thread.currentThread().interrupt();
                    return;
                } catch (RuntimeException e) {
                    // One bad URL must not kill the worker, but the failure should be visible.
                    e.printStackTrace();
                }
            }
        }

        private void schedule(String url) {
            if (url == null) {
                return;
            }
            download(url);
        }
    }

    /**
     * Despite its name, reports whether the string is <em>not</em> empty.
     *
     * @param str string to check
     * @return {@code true} when {@code str} is neither {@code null}, nor {@code ""}, nor the
     *         literal text {@code "null"}; {@code false} otherwise
     */
    public static boolean isEmpy(String str) {
        return str != null && !"".equals(str) && !"null".equals(str);
    }
}