#!/usr/bin/env python
"""Report job links that are new since the previous run of this script.

Fetches listing pages 1-5 of the ``beijing-morebbsjob`` board on
www.yingjiesheng.com, collects every ``<a>`` whose ``href`` matches
``/job-001.*`` and prints the links whose href is absent from the snapshot
pickled by the previous run.  When at least one new link is found, the
snapshot file is overwritten with the links collected in this run.

Written for Python 2 (``urllib2`` and the BeautifulSoup 3 package).
"""
__author__ = "Dan Deng (sixu05202004@gmail.com)"
__version__ = "0.1.0"
__copyright__ = "Copyright (c) 2011-2012 Dan"
__license__ = "New-style BSD"

import errno
import pickle
import re
import urllib2

from BeautifulSoup import BeautifulSoup

BASE_URL = 'http://www.yingjiesheng.com'

# NOTE(review): there is no path separator after the drive letter, so on
# Windows this presumably resolves relative to the current directory of
# drive D: rather than to the root of the drive.  Kept byte-for-byte so an
# existing snapshot is still found -- confirm the intended location.
SNAPSHOT_PATH = 'D:temp1.plk'

pattern = re.compile('/job-001.*')
num = 0    # number of links not present in the previous snapshot
test = {}  # href -> link text for every matching link seen in this run

for i in range(1, 6):
    url = BASE_URL + '/beijing-morebbsjob-' + str(i) + '.html'
    pagesource = urllib2.urlopen(url)
    try:
        soup = BeautifulSoup(pagesource)
    finally:
        # Release the HTTP connection even if parsing raises.
        pagesource.close()
    for link in soup.findAll('a', href=pattern):
        test[str(link['href'])] = str(link.string)
    # Only plain strings were kept above, so the parse tree can be emptied.
    soup.clear()

try:
    with open(SNAPSHOT_PATH, 'rb') as f:
        # NOTE: unpickling can execute arbitrary code; this is acceptable
        # only because the file is expected to be written by this script.
        test2 = pickle.load(f)
except IOError as exc:
    if exc.errno != errno.ENOENT:
        raise
    # No snapshot yet (first run): every collected link counts as new.
    test2 = {}

for eachline, title in test.items():
    if eachline not in test2:
        # Single-argument form: prints "<url> <title>" exactly as a
        # two-item print statement would.
        print(BASE_URL + eachline + ' ' + title)
        num += 1
print("the count of update jobs is %d" % num)

if num:
    # Replace the snapshot with this run's links (highest pickle protocol).
    with open(SNAPSHOT_PATH, 'wb') as f:
        pickle.dump(test, f, -1)
抓取应届生的职位信息的代码
抓取应届生求职网(yingjiesheng.com)北京地区职位信息的代码如下(前五页):