首页 » Java » 使用gecco获取代理IP,仅测试代码用,不要用于非法用途

使用gecco获取代理IP,仅测试代码用,不要用于非法用途

2016-03-29 17:40:36阅读(1792)

这里一共用到了三个类。
首先是 gecco 的入口类,负责抓取 IP 列表:

package com.geccocrawler.gecco.demo.ipcatch;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.List;

import com.geccocrawler.gecco.GeccoEngine;
import com.geccocrawler.gecco.annotation.Gecco;
import com.geccocrawler.gecco.annotation.HtmlField;
import com.geccocrawler.gecco.annotation.Request;
import com.geccocrawler.gecco.annotation.RequestParameter;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.spider.HtmlBean;

/**
 * Bean for one page of the free proxy list; it is also the crawler's entry point.
 *
 * <p>The current page number is taken from the {@code currPage} URL segment and
 * the content of the {@code #list} element is mapped to {@link IpDetail} items.
 * As a side effect of {@link #setIps(List)}, every extracted {@code ip:port}
 * pair is appended to {@link #OUTPUT_PATH}.
 */
@Gecco(matchUrl="http://www.kuaidaili.com/free/inha/{currPage}/", pipelines={"consolePipeline", "IpListPipeline"})
public class IpList implements HtmlBean {

    private static final long serialVersionUID = 4544492736813943899L;

    /** File that the extracted {@code ip:port} lines are appended to. */
    private static final String OUTPUT_PATH = "E:/ip/ip.txt";

    /** Line terminator written after every entry (carriage return + line feed). */
    private static final String LINE_END = "\r\n";

    @Request
    private HttpRequest request;

    @RequestParameter("currPage")
    private String currPage;

    /**
     * Details of the list entries (ip, port, ...) extracted from {@code #list}.
     */
    @HtmlField(cssPath="#list")
    private List<IpDetail> ips;


    public HttpRequest getRequest() {
        return request;
    }

    public void setRequest(HttpRequest request) {
        this.request = request;
    }

    public String getCurrPage() {
        return currPage;
    }

    public void setCurrPage(String currPage) {
        this.currPage = currPage;
    }

    public List<IpDetail> getIps() {
        return ips;
    }

    /**
     * Stores the extracted entries and appends them to {@link #OUTPUT_PATH}.
     *
     * <p>Only the first element is read: its ip and port values are each split
     * on {@code "\n"} and paired up by index. A {@code null} or empty list, or a
     * first element without ip/port text, is stored without writing anything.
     *
     * @param ips the extracted list entries; may be {@code null} or empty
     * @throws IOException if the output file cannot be opened or written
     */
    public void setIps(List<IpDetail> ips) throws IOException {
        this.ips = ips;
        if (ips == null || ips.isEmpty()) {
            return;
        }
        IpDetail first = ips.get(0);
        if (first == null || first.getIp() == null || first.getPort() == null) {
            return;
        }
        appendProxies(first.getIp().split("\n"), first.getPort().split("\n"));
    }

    /** Appends one {@code ip:port} line per index present in both arrays. */
    private static void appendProxies(String[] ipGroup, String[] portGroup) throws IOException {
        File file = new File(OUTPUT_PATH);
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        // Pair only the indices both columns have, so a short column cannot
        // cause an ArrayIndexOutOfBoundsException.
        int count = Math.min(ipGroup.length, portGroup.length);
        // Append mode creates the file when it is missing; try-with-resources
        // closes (and thereby flushes) the writer even if a write fails.
        try (FileWriter writer = new FileWriter(file, true)) {
            for (int i = 0; i < count; i++) {
                System.out.println(ipGroup[i]);
                writer.write(ipGroup[i] + ":" + portGroup[i] + LINE_END);
            }
        }
    }

    public static void main(String[] args) {
        GeccoEngine.create()
        .classpath("com.geccocrawler.gecco.demo.ipcatch")
        // URL of the page the crawl starts from
        .start("http://www.kuaidaili.com/free/inha/1/")
        // number of crawler threads; preferably not more than the number of start requests
        .thread(1)
        // pause of a single crawler after each fetched request
        .interval(5000)
        .run();
    }
}

难点在于:jsoup 无法单独解析没有 table 标签包裹的 td 标签,每一列只能得到一个字符串,所以上面的 setIps 里要用 split("\n") 把它拆成多条记录。
下面是保存详细信息(IP 和端口)的类:

package com.geccocrawler.gecco.demo.ipcatch;

import com.geccocrawler.gecco.annotation.HtmlField;
import com.geccocrawler.gecco.spider.HtmlBean;

/**
 * Bean holding the ip and port text extracted from a proxy list.
 *
 * <p>{@code ip} is bound to the first {@code td} child and {@code port} to the
 * second one.
 */
public class IpDetail implements HtmlBean {

    private static final long serialVersionUID = 2555530396237160927L;

    /** Text selected by {@code td:nth-child(1)}. */
    @HtmlField(cssPath="td:nth-child(1)")
    private String ip;

    /** Text selected by {@code td:nth-child(2)}. */
    @HtmlField(cssPath="td:nth-child(2)")
    private String port;

    /** @return the extracted ip text */
    public String getIp() {
        return this.ip;
    }

    /** @param ip the ip text to store */
    public void setIp(final String ip) {
        this.ip = ip;
    }

    /** @return the extracted port text */
    public String getPort() {
        return this.port;
    }

    /** @param port the port text to store */
    public void setPort(final String port) {
        this.port = port;
    }

    /**
     * Does nothing; both arguments are ignored.
     *
     * @param ip unused
     * @param port unused
     */
    public void text(final String ip, final String port) {
        // intentionally empty
    }

}

然后是负责分页抓取的 Pipeline 类:

package com.geccocrawler.gecco.demo.ipcatch;

import com.geccocrawler.gecco.annotation.PipelineName;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.scheduler.SchedulerContext;

/**
 * Pipeline that schedules the next list page after an {@link IpList} page has
 * been processed, up to {@link #MAX_PAGE}.
 */
@PipelineName("IpListPipeline")
public class IpListPipeline implements Pipeline<IpList> {

    /** Highest page number that is still scheduled. */
    private static final int MAX_PAGE = 100;

    /** URL prefix of the list pages; the page number and a trailing slash are appended. */
    private static final String LIST_URL_PREFIX = "http://www.kuaidaili.com/free/inha/";

    /**
     * Computes the page after the current one and pushes a sub-request for it
     * into the scheduler.
     *
     * <p>Nothing is scheduled when the next page would exceed {@link #MAX_PAGE},
     * or when the current page parameter is missing or not a number.
     *
     * @param ipList the bean populated for the page that was just fetched
     */
    @Override
    public void process(IpList ipList) {
        String currPage = ipList.getCurrPage();
        int nextPage;
        try {
            nextPage = Integer.parseInt(currPage) + 1;
        } catch (NumberFormatException e) {
            // Without a numeric page there is no "next" page to derive; report
            // it and stop paging instead of failing the whole pipeline.
            System.err.println("IpListPipeline: cannot parse page number '" + currPage
                    + "', pagination stopped");
            return;
        }
        if (nextPage > MAX_PAGE) {
            return;
        }

        // Continue with the next page.
        HttpRequest currRequest = ipList.getRequest();
        String currUrl = currRequest.getUrl();
        String nextUrl;
        if (currUrl.contains("inha")) {
            nextUrl = LIST_URL_PREFIX + nextPage + "/";
        } else {
            // NOTE(review): every URL matched by IpList contains "inha", so this
            // branch looks unreachable; kept unchanged for compatibility.
            nextUrl = currUrl + "/" + nextPage + "/";
        }
        SchedulerContext.into(currRequest.subRequest(nextUrl));
    }

}

当然,这是基于gecco的,首先要引入gecco,地址:https://github.com/xtuhcy/gecco

最新发布

CentOS专题

关于本站

5ibc.net旗下博客站精品博文小部分原创、大部分从互联网收集整理。尊重作者版权、传播精品博文,让更多编程爱好者知晓!

小提示

按 Ctrl+D 键,
把本文加入收藏夹