lxml 加 beautifulsoup with 多线程(Snail)

抓取网页时,要获取页面中指定的内容,最基本的办法是使用正则表达式;但如果要处理的网站很多,为每个网站单独写正则会非常繁琐。于是我写了一个基类,用 lxml(配合 BeautifulSoup 解析器)把页面解析成一棵树,代码如下:

蜗牛号基类

# -*- coding: utf-8 -*-
import threading, Queue
import urllib2, cookielib
from lxml.html.soupparser import fromstring

class Snail(threading.Thread):
	"""Thread base class bundling a cookie-aware HTTP opener and an HTML parser.

	Every instance owns its own cookie jar (``self.cookie``) and its own
	opener (``self.opener``), so several instances running as threads do not
	overwrite each other's headers or cookies. Subclasses are expected to
	override ``run()`` (from ``threading.Thread``) and ``save()``.
	"""

	def __init__(self, *argv):
		"""Build the HTTP opener, then initialise the thread.

		Args:
			*argv: extra ``(header_name, header_value)`` tuples appended to
				the default request headers.
		"""
		self.buildBrowser(*argv)
		super(Snail, self).__init__()

	def buildBrowser(self, *argv):
		"""Create the cookie jar and the opener used for all requests.

		Args:
			*argv: extra ``(header_name, header_value)`` tuples appended
				after the default ``User-Agent`` and ``Accept`` headers.
		"""
		self.cookie = cookielib.CookieJar()
		opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookie))
		opener.addheaders = [
			('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.904.0 Safari/535.7'),
			('Accept', '*/*'),
		] + list(argv)
		# Keep the opener on the instance: install_opener() is process-global,
		# so with several instances the last one built would otherwise win.
		self.opener = opener
		# Still install it globally so existing code that calls
		# urllib2.urlopen() directly keeps seeing these headers.
		urllib2.install_opener(opener)

	def parse(self, link, data = None):
		"""Fetch ``link`` and return the parsed document.

		Args:
			link: URL to request.
			data: optional request body, passed through to ``handle()``.

		Returns:
			The result of ``lxml.html.soupparser.fromstring`` applied to the
			response body.
		"""
		response = self.handle(link, data)
		try:
			content = response.read()
		finally:
			# Release the connection promptly instead of waiting for GC.
			response.close()
		return fromstring(content)

	def handle(self, link, data = None):
		"""Open ``link`` and return the response object.

		Args:
			link: URL to request.
			data: optional request body handed to ``urllib2.Request``.

		Returns:
			The file-like response returned by the opener.
		"""
		request = urllib2.Request(link, data)
		# Subclasses that never ran Snail.__init__ have no per-instance
		# opener; fall back to the globally installed one for them.
		opener = getattr(self, 'opener', None)
		if opener is None:
			return urllib2.urlopen(request)
		return opener.open(request)

	def save(self):
		"""Persistence hook; does nothing here and is meant to be overridden."""
		pass
# Example instance created at import time; the tuple is an extra request
# header passed through to Snail.buildBrowser().
snail = Snail(
	('Cookie','这边可以自己填写'),
)

需要 lxml 和 BeautifulSoup 模块支持(代码中使用的 lxml.html.soupparser 依赖 BeautifulSoup),lxml 可以到 http://lxml.de/ 下载。CentOS 下的安装方法:

# Header packages needed to build lxml; CentOS names them with the
# "-devel" suffix ("-dev" is the Debian/Ubuntu naming).
yum -y install libxml2-devel libxslt-devel
# Download, unpack and install lxml from source.
cd /usr/local/src
wget http://lxml.de/files/lxml-2.3.2.tgz
tar vzxf lxml-2.3.2.tgz
cd lxml-2.3.2
python setup.py install

# Back to /usr/local/src, then install BeautifulSoup 3 the same way.
cd ../
wget http://www.crummy.com/software/BeautifulSoup/download/3.x/BeautifulSoup-3.2.0.tar.gz
tar vzxf BeautifulSoup-3.2.0.tar.gz
cd BeautifulSoup-3.2.0
python setup.py install

基类的使用实例

线程锁rlock

# -*- coding: utf-8 -*-
from Snail import Snail
import threading

# Lock shared by all worker threads; held while a worker takes its next
# value from the shared resource.
rlock = threading.RLock()
class Mphone(Snail):
	"""Example worker thread built on Snail.

	``run()`` repeatedly takes a value under ``rlock`` and passes it to
	``process()``. The value source is a placeholder to be filled in.
	"""
	def __init__(self, *argv):
		"""Initialise the worker.

		Args:
			*argv: optional extra ``(header_name, header_value)`` tuples
				forwarded to ``Snail.__init__``.
		"""
		# super() must name this class, not Snail; otherwise Snail.__init__
		# is skipped and the worker's browser is never built.
		super(Mphone, self).__init__(*argv)
	def process(self, numhead):
		"""Handle one value; placeholder for the real work."""
		# Do the actual processing here.
		pass
	def run(self):
		"""Thread body: take values under the lock until none are left."""
		while True:
			# Hold the lock only while taking the next value; "with"
			# releases it even if the fetch raises.
			with rlock:
				# Placeholder: replace with the real fetch from the shared
				# resource. None means there is nothing left to do.
				numhead = None
			# Leave the loop once the stop condition is met.
			if numhead is None:
				break
			self.process(numhead)
if __name__ == '__main__':
	# Number of worker threads to create.
	threadnum = 4
	# Build every worker first, then start them all, then wait for them all.
	threads = [Mphone() for _ in range(threadnum)]
	for worker in threads:
		worker.start()
	for worker in threads:
		worker.join()

可能会遇到的一些问题

TypeError: cannot concatenate 'str' and 'NoneType' objects
# Workaround: decode the byte string as UTF-8 into a unicode object,
# ignoring bytes that cannot be decoded.
unicode('这边是要解析的字符串', 'utf-8', errors='ignore')

标签: none

添加新评论