A Script to Fetch and Verify HTTP Proxies
坏虾 · Security Tools · 54,432 views · 2016-11-15 20:02
# coding: utf-8
import httplib
import random
import urllib2
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool

user_agents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ (KHTML, like Gecko) Element Browser 5.0',
    'IBM WebExplorer /v0.94',
    'Galaxy/1.0 [en] (Mac OS X 10.5.6; U; en)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
    'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)',
]

def get_proxy():
    # Scrape the xici.net.co proxy list and save HTTP proxies to proxy.txt.
    pf = open('proxy.txt', 'w')
    for page in range(1, 200):
        url = 'http://www.xici.net.co/nn/' + str(page)
        request = urllib2.Request(url)
        # Rotate the User-Agent on each request to look less like a scraper.
        user_agent = random.choice(user_agents)
        request.add_header('User-Agent', user_agent)
        response = urllib2.urlopen(request)
        html = response.read()

        soup = BeautifulSoup(html, 'lxml')
        trs = soup.find('table', id='ip_list').find_all('tr')

        # Skip the header row; each remaining row is one proxy entry.
        for tr in trs[1:]:
            tds = tr.find_all('td')
            ip = tds[1].text.strip()
            port = tds[2].text.strip()
            protocol = tds[5].text.strip()
            if protocol == 'HTTP':
                pf.write('%s=%s:%s\n' % (protocol, ip, port))
                print '%s=%s:%s' % (protocol, ip, port)
    pf.close()

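# Note (illustrative, not from the original post): each line get_proxy()
# writes to proxy.txt has the form PROTOCOL=ip:port, e.g. HTTP=121.40.108.76:80
# (made-up address). var_proxy() below parses exactly that format.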
def var_proxy():
    # Re-check every proxy in proxy.txt and save the working ones
    # to available.txt.
    inFile = open('proxy.txt', 'r')
    outFile = open('available.txt', 'w')
    lines = inFile.readlines()
    inFile.close()

    def check(line):
        protocol, proxy = line.strip().split('=')
        ip, port = proxy.split(':')
        headers = {'Content-Type': 'application/x-www-form-urlencoded',
                   'Cookie': '',
                   'Referer': 'http://www.baidu.com'}
        try:
            # Connect to the proxy itself and request an absolute URL: if the
            # IP-echo page reports the proxy's IP, the proxy really forwarded us.
            conn = httplib.HTTPConnection(proxy, timeout=3.0)
            conn.request(method='GET', url='http://1212.ip138.com/ic.asp', headers=headers)
            res = conn.getresponse()
            html_doc = res.read()
            if html_doc.find(ip) != -1:
                print proxy
                outFile.write('%s\n' % proxy)
            else:
                print "error!"
        except Exception, e:
            print e

    # Check 20 proxies at a time; the unsynchronized writes to outFile are
    # rough but acceptable for a throwaway script.
    pool = ThreadPool(20)
    pool.map(check, lines)
    pool.close()
    pool.join()
    outFile.close()

if __name__ == '__main__':
    print "getproxy 1"
    print "varproxy 2"
    flag = raw_input()
    if flag == "1":
        get_proxy()
    elif flag == "2":
        var_proxy()
    else:
        print "input error!"
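The workflow is to run the script once and enter 1 to harvest proxies, then run it again and enter 2 to filter them. To actually use one of the verified proxies afterwards, a convenient option in this Python 2 setup is urllib2's ProxyHandler. The snippet below is a minimal sketch rather than part of the original script: the proxy address is a made-up placeholder standing in for a line read from available.txt, and it fetches the same ip138 echo page the checker uses.

# coding: utf-8
# Minimal sketch (assumed, not from the original post): route a request
# through a verified proxy with urllib2's ProxyHandler.
import urllib2

proxy = '121.40.108.76:80'  # made-up placeholder; in practice, read a line from available.txt
opener = urllib2.build_opener(urllib2.ProxyHandler({'http': 'http://' + proxy}))
# The echo page should now report the proxy's IP rather than ours.
print opener.open('http://1212.ip138.com/ic.asp', timeout=5).read()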