package com.zhanzhang.tools; import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PrintWriter; import java.net.HttpURLConnection; import java.net.URL; import java.net.URLEncoder; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.w3c.dom.Document; import org.w3c.dom.Element; public class 去广告链接 { final static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); public static String parse302URL(String url) { String location = null; try { URL serverUrl = new URL(url); HttpURLConnection conn = (HttpURLConnection) serverUrl .openConnection(); conn.setRequestMethod("GET"); conn.setConnectTimeout(10000); conn.setReadTimeout(10000); // 必须设置false,否则会自动redirect到Location的地址 conn.setInstanceFollowRedirects(false); conn.addRequestProperty("Accept-Charset", "UTF-8;"); conn.addRequestProperty( "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"); conn.addRequestProperty("Referer", "http://www.zuidaima.com/"); conn.connect(); location = conn.getHeaderField("Location"); conn.disconnect(); } catch (Exception e) { e.printStackTrace(); } return location; } public static String request(String url) { StringBuffer res = new StringBuffer(); HttpURLConnection conn = null; try { URL serverUrl = new URL(url); conn = (HttpURLConnection) serverUrl.openConnection(); conn.setRequestMethod("GET"); conn.setConnectTimeout(10000); conn.setReadTimeout(10000); conn.addRequestProperty("Accept-Charset", "UTF-8;"); conn.addRequestProperty( "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"); conn.addRequestProperty("Referer", "http://www.zuidaima.com/"); conn.connect(); InputStream ins = conn.getInputStream(); String charset = "UTF-8"; InputStreamReader inr = new InputStreamReader(ins, charset); BufferedReader bfr = new BufferedReader(inr); String line = ""; do { res.append(line); line = bfr.readLine(); } while (line != null); inr.close(); bfr.close(); } catch (Exception e) { System.out.println("###error:" + e.getMessage() + " at " + new Date()); if (e.toString().indexOf("FileNotFound") != -1) { res.append("404"); } } finally { if (conn != null) { conn.disconnect(); } } return res.toString(); } public static List<Web> parseWebNavs(String html) { List<Web> webs = new ArrayList<Web>(); Pattern pattern = Pattern .compile("<h3 class=\"t\"><a[\\s\\S]*?href.*?\"([\\s\\S]*?)\"[\\s\\S]*?>([\\s\\S]*?)</a>"); Matcher matcher = pattern.matcher(html); while (matcher.find()) { String _url = matcher.group(1); String name = matcher.group(2); name = name.replaceAll("<.*?>", " ").trim(); String url = parse302URL(_url); if (url == null) { url = _url; } if (webs.contains(url)) { continue; } Web web = new Web(url, name); webs.add(web); } return webs; } public static List<Web> parseURL(String keyword) throws IOException { String _keyword = URLEncoder.encode("site:zuidaima.com " + keyword, "utf-8"); List<Web> webs = new ArrayList<Web>(); int p = 1; int s = 10; while (true) { int pn = (p - 1) * s; String url = "http://www.baidu.com/s?wd=%s&pn=%s&ie=utf-8&usm=1&rsv_page=1"; System.out.println("Start to parse " + keyword + " " + p); String _url = String.format(url, _keyword, pn + ""); System.out.println(p + " Request url " + _url); String html = request(_url); List<Web> _webs = parseWebNavs(html); for (Web web : _webs) { if (!request(web.getHome()).equals("404")) { System.out.println("过滤掉:" + web.getName()); continue; } if (webs.contains(web)) { continue; } webs.add(web); } if (html.indexOf("下一页") == -1) { break; } p++; try { Thread.sleep(3000); } catch (InterruptedException e) { e.printStackTrace(); } } return webs; } public static Element createUrlElement(Document document, String loc, String priority, String lastmod, String changefreq) { Element element = document.createElement("url"); Element locElement = document.createElement("loc"); locElement.appendChild(document.createTextNode(loc)); element.appendChild(locElement); Element priorityElement = document.createElement("priority"); priorityElement.appendChild(document.createTextNode(priority)); element.appendChild(priorityElement); Element lastmodElement = document.createElement("lastmod"); lastmodElement.appendChild(document.createTextNode(lastmod)); element.appendChild(lastmodElement); Element changefreqElement = document.createElement("changefreq"); changefreqElement.appendChild(document.createTextNode(changefreq)); element.appendChild(changefreqElement); return element; } public static void main(String[] args) throws Exception { List<Web> webs = new ArrayList<Web>(); String keywords = "学校,学院,办理,毕业证"; String[] _keywords = keywords.split(","); for (String keyword : _keywords) { List<Web> _webs = parseURL(keyword); webs.addAll(_webs); } DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); Document document = builder.newDocument(); Element root = document.createElement("urlset"); document.appendChild(root); Date now = new Date(); String loc = null; String priority = null; String lastmod = null; String changefreq = null; for (Web web : webs) { loc = web.getHome(); priority = "0.8"; lastmod = sdf.format(now); changefreq = "daily"; Element shareElement = createUrlElement(document, loc, priority, lastmod, changefreq); root.appendChild(shareElement); } File file = new File("c:/sitemap_trash.xml"); if (!file.getParentFile().exists()) { file.mkdirs(); } if (file.exists()) { file.delete(); } else { file.createNewFile(); } TransformerFactory tf = TransformerFactory.newInstance(); Transformer transformer = tf.newTransformer(); DOMSource source = new DOMSource(document); transformer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); PrintWriter pw = new PrintWriter(new FileOutputStream(file)); StreamResult result = new StreamResult(pw); transformer.transform(source, result); // String html = request("http://www.zuidaima.com/share/k%E5%8A%9E%E7%90%86%E4%B8%8A%E6%B5%B7%E4%B8%AD%E4%BE%A8%E8%81%8C%E4%B8%9A%E6%8A%80%E6%9C%AF%E5%AD%A6%E9%99%A2%E6%96%87%E5%87%AD%E3%80%90%E8%81%94%E7%B3%BBQQ%EF%BC%9A931957539%E3%80%91%E5%8A%9E%E7%90%86%E4%B8%8A%E6%B5%B7%E4%B8%AD%E4%BE%A8%E8%81%8C%E4%B8%9A%E6%8A%80%E6%9C%AF%E5%AD%A6%E9%99%A2%E6%96%87%E5%87%AD%E2%98%852014%E5%B9%B412%E6%9C%8829%E6%97%A5roab9f-p1-s1.htm"); // System.out.println(html); } }