// Automatically changing the IP address to get around anti-crawler blocking (自动更改IP地址反爬虫封锁)

/**
 *
 * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川,
 * yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 *
 */

package org.apdplat.superword.tools;

import org.apache.commons.lang.StringUtils;
import org.apdplat.superword.model.Word;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

/**

  • 利用爱词霸筛选词表中属于各大考试的词

  • 提取爱词霸页面中的自定义信息

  • 考虑到爱词霸的防爬虫限制,特提供包含61821个单词的爱词霸HTML页面origin_html.zip文件供下载

  • 下载地址http://pan.baidu.com/s/1bnD9gy7

  • @author 杨尚川

*/

public class WordClassifier {

private WordClassifier(){}

private static final Logger LOGGER = LoggerFactory.getLogger(WordClassifier.class);

private static final String ICIBA = "http://www.iciba.com/";

private static final

String TYPECSSPATH

"html body.bgmain div#layout div#center div#mainbox div#dictmain div.dictbar div.wdgenre a";

private static final

String UNFOUNDCSSPATH

"html body.bgmain div#layout div#center div#mainbox div#dictmain div#question.question.unfoundtips";

private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,/;q=0.8";

private static final String ENCODING = "gzip, deflate";

private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";

private static final String CONNECTION = "keep-alive";

private static final String HOST = "www.iciba.com";

private static final String REFERER = "http://www.iciba.com/";

private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0";

private static final Set NOTFOUNDWORDS = new HashSet<>();

private static final Set ORIGIN_HTML = new HashSet<>();

public static void classify(Set words){

LOGGER.debug("待处理词数目:"+words.size());

AtomicInteger i = new AtomicInteger();

Map<String,

List> data

new HashMap<>();

words.forEach(word -> {

if(i.get()%1000

999){

save(data);

}

showStatus(data, i.incrementAndGet(), words.size(), word.getWord());

String html = getContent(word.getWord());

//LOGGER.debug("获取到的HTML:" +html);

while(html.contains("非常抱歉,来自您ip的请求异常频繁")){

//使用新的IP地址

DynamicIp.toNewIp();

html = getContent(word.getWord());

}

if(StringUtils.isNotBlank(html)) {

parse(word.getWord(), html, data);

if(!NOTFOUNDWORDS.contains(word.getWord())) {

ORIGIN_HTML.add(word.getWord()

  • "杨尚川"
  • html);

}

}else{

NOTFOUNDWORDS.add(word.getWord());

}

});

//写入磁盘

save(data);

LOGGER.debug("处理完毕,总词数目:"+words.size());

}

public static void parse(String path){

if(path.endsWith(".zip")){

parseZip(path);

}

if(Files.isDirectory(Paths.get(path))){

parseDir(path);

}else{

parseFile(path);

}

}

public static void parseDir(String dir) {

LOGGER.info("开始解析目录:"

  • dir);

try {

Files.walkFileTree(Paths.get(dir), new SimpleFileVisitor() {

@Override

public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {

parseFile(file.toFile().getAbsolutePath());

return FileVisitResult.CONTINUE;

}

});

} catch (IOException e) {

LOGGER.error("解析文本出错", e);

}

}

public static void parseZip(String zipFile){

LOGGER.info("开始解析ZIP文件:"+zipFile);

try (FileSystem fs

FileSystems.newFileSystem(Paths.get(zipFile), WordClassifier.class.getClassLoader())) {

for(Path path : fs.getRootDirectories()){

LOGGER.info("处理目录:"+path);

Files.walkFileTree(path, new SimpleFileVisitor(){

@Override

public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {

LOGGER.info("处理文件:"+file);

// 拷贝到本地文件系统

Path temp = Paths.get("target/origin-html-temp.txt");

Files.copy(file, temp, StandardCopyOption.REPLACE_EXISTING);

parseFile(temp.toFile().getAbsolutePath());

return FileVisitResult.CONTINUE;

}

});

}

}catch (Exception e){

LOGGER.error("解析文本出错", e);

}

}

public static void parseFile(String file){

LOGGER.info("开始解析文件:"+file);

try (BufferedReader reader

new BufferedReader(

new InputStreamReader(

new BufferedInputStream(

new FileInputStream(file))))) {

Map<String,

List> data

new HashMap<>();

String line = null;

while ((line = reader.readLine()) != null) {

parse(line, data);

}

save(data);

} catch (IOException e) {

LOGGER.error("解析文本出错", e);

}

}

public static void parse(String html, Map> data){

LOGGER.debug("html:"+html);

String[] attr = html.split("杨尚川");

if(attr == null || attr.length != 2){

LOGGER.error("解析文本失败,文本应该以'杨尚川'分割,前面是词,后面是网页,网页内容是去除换行符之后的一整行文本:"+html);

return;

}

String word = attr[0];

LOGGER.info("解析单词:"+word);

String htm = attr[1];

parse(word, htm, data);

}

public static void showStatus(Map> data, int current, int total, String word){

LOGGER.debug("开始处理词 "+current+"/"+total+" ,完成进度 "+current/(float)total*100+"% :"+word);

data.entrySet().forEach(e -> {

LOGGER.debug(e.getKey()+"\t"+e.getValue().size());

});

}

public static void save(Map> data){

LOGGER.info("将数据写入磁盘,防止丢失");

data.keySet().forEach(key -> {

try {

String path = "src/main/resources/word_"

  • key
  • ".txt";

LOGGER.error("保存词典文件:"

  • path);

List existWords

Files.readAllLines(Paths.get(path));

Set allWords

new HashSet<>();

existWords.forEach(line -> {

String[] attr = line.split("\s+");

if(attr != null) {

String w = "";

if(attr.length

1){

w = attr[0];

}

if(attr.length

2){

w = attr[1];

}

allWords.add(w);

}

});

allWords.addAll(data.get(key));

AtomicInteger i = new AtomicInteger();

List list = allWords

.stream()

.sorted()

.map(word -> i.incrementAndGet()+"\t"

  • word)

.collect(Collectors.toList());

Files.write(Paths.get(path), list);

data.get(key).clear();

existWords.clear();

allWords.clear();

list.clear();

}catch (Exception e){

LOGGER.error("保存词典文件失败", e);

}

});

data.clear();

try {

if(!NOTFOUNDWORDS.isEmpty()) {

String path = "src/main/resources/wordnotfound.txt";

LOGGER.error("保存词典文件:"

  • path);

AtomicInteger i = new AtomicInteger();

//NOTFOUNDWORDS比较少,常驻内存

List list

NOTFOUNDWORDS

.stream()

.sorted()

.map(word -> i.incrementAndGet()

  • "\t"
  • word)

.collect(Collectors.toList());

Files.write(Paths.get(path), list);

list.clear();

}

//保存原始HTML

if(!ORIGIN_HTML.isEmpty()) {

String path = "src/main/resources/originhtml"

  • System.currentTimeMillis()
  • ".txt";

LOGGER.error("保存词典文件:"

  • path);

Files.write(Paths.get(path), ORIGIN_HTML);

ORIGIN_HTML.clear();

}

}catch (Exception e){

LOGGER.error("保存词典文件失败", e);

}

}

public static String getContent(String word) {

String url = ICIBA

  • word + "?renovate="
  • (new Random(System.currentTimeMillis()).nextInt(899999)+100000);

LOGGER.debug("url:"+url);

Connection conn = Jsoup.connect(url)

.header("Accept", ACCEPT)

.header("Accept-Encoding", ENCODING)

.header("Accept-Language", LANGUAGE)

.header("Connection", CONNECTION)

.header("Referer", REFERER)

.header("Host", HOST)

.header("User-Agent", USER_AGENT)

.ignoreContentType(true);

String html = "";

try {

html = conn.post().html();

html = html.replaceAll("[\n\r]", "");

}catch (Exception e){

LOGGER.error("获取URL:"+url+"页面出错", e);

}

return html;

}

public static void parse(String word, String html, Map> data){

Document doc = Jsoup.parse(html);

Elements es = doc.select(TYPECSSPATH);

for(Element e : es){

String type = e.text();

LOGGER.debug("获取到的类型:"+type);

if(StringUtils.isNotBlank(type)){

data.putIfAbsent(type, new ArrayList<>());

data.get(type).add(word);

}

}

es = doc.select(UNFOUNDCSSPATH);

for(Element e : es){

String notFound = e.text();

LOGGER.debug("没有该词:"+notFound);

if(StringUtils.isNotBlank(notFound)

&& (notFound.contains("对不起,没有找到")

|| notFound.contains("您要查找的是不是"))){

NOTFOUNDWORDS.add(word);

}

}

}

public static void main(String[] args) {

//Set words = new HashSet<>();

//words.add(new Word("time", ""));

//words.add(new Word("yangshangchuan", ""));

//classify(words);

//classify(WordSources.getAll());

//parse("src/main/resources/originhtml1427060576977.txt");

//origin_html.zip包含61821个单词的爱词霸解析HTML页面,下载地址http://pan.baidu.com/s/1bnD9gy7

parse("/Users/apple/百度云同步盘/origin_html.zip");

}

}