# -*- coding: UTF-8 -*-
import multiprocessing as mp
from multiprocessing import Queue
import BeautifulSoup as bs4
from Queue import Empty
import urllib
import json
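"""
getPageData acts as the producer: it fetches the job listings. getDetail acts as the consumer: it fetches the detailed information for each job. `queue` holds the IDs of the job detail pages.
"""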
# Search endpoint; the functions in this file POST the fields kd (keyword),
# pn (page number) and first to it and parse the JSON reply.
interface = 'http://www.lagou.com/jobs/positionAjax.json?px=default&yx=10k-15k&needAddtionalResult=false'
# Template for a job detail page; {0} is filled with a positionId.
detailUrl = 'http://www.lagou.com/jobs/{0}.html'
def getPageData(task, queue, keyword='python'):
    """Producer: fetch listing pages and enqueue the job IDs they contain.

    Repeatedly takes a page number from ``task``, POSTs it together with
    ``keyword`` to ``interface`` and puts the ``positionId`` of every entry
    in the JSON response onto ``queue``.

    Args:
        task: queue of page numbers still to be fetched.
        queue: queue that receives the position IDs found on each page.
        keyword: search keyword sent as the ``kd`` form field.

    Returns:
        None. The loop ends once ``task`` yields nothing for one second.
    """
    while True:
        try:
            page = task.get(timeout=1)
        except Empty:
            # No page number arrived within the timeout: the work list is
            # considered exhausted.
            break
        post_data = {'kd': keyword, 'pn': page, 'first': 'false'}
        opener = urllib.urlopen(interface, urllib.urlencode(post_data))
        try:
            jsonData = json.loads(opener.read())
        finally:
            # Release the connection even when reading or parsing fails.
            opener.close()
        results = jsonData['content']['positionResult']['result']
        for result in results:
            queue.put(result['positionId'])
def getDetail(queue, result):
    """Consumer: download job detail pages and publish their description.

    Repeatedly takes a position ID from ``queue``, downloads the matching
    ``detailUrl`` page, extracts the first element whose class is
    ``job_bt`` and puts ``"<url> <element>"`` onto ``result``.

    Args:
        queue: queue of position IDs to process.
        result: queue that receives one formatted string per job.

    Returns:
        None. The loop ends once ``queue`` yields nothing for one second.
    """
    while True:
        try:
            positionId = queue.get(timeout=1)
        except Empty:
            print(mp.current_process().name + 'exit')
            break
        url = detailUrl.format(positionId)
        print('{0} {1}'.format(url, mp.current_process().name))
        opener = urllib.urlopen(url)
        try:
            html = opener.read()
        finally:
            # Release the connection even when the read fails.
            opener.close()
        soup = bs4.BeautifulSoup(html)
        matches = soup.findAll(attrs={"class": "job_bt"})
        if not matches:
            # A page without a job_bt element must not kill the worker with
            # an IndexError; report it and move on to the next ID.
            print('{0} has no job_bt element, skipped'.format(url))
            continue
        content = matches[0]
        result.put('{0} {1}'.format(url, content))
def _drain_results(result, out):
    """Write every item currently obtainable from ``result`` to ``out``."""
    while True:
        try:
            item = result.get(timeout=0.5)
        except Empty:
            return
        # One record per line so that entries do not run into each other.
        out.write(item + '\n')


def start(keyword='python', max_pages=3):
    """Crawl job listings for ``keyword`` and write the details to ``jobs``.

    Fetches the first listing page itself to learn the total number of
    pages and to seed the ID queue, then starts one ``getDetail`` process
    per CPU plus one ``getPageData`` process for the remaining pages.
    Results are written to the file ``jobs`` in the working directory.

    Args:
        keyword: search keyword sent as the ``kd`` form field.
        max_pages: upper bound on the number of listing pages crawled
            (defaults to 3, the former hard-coded debugging limit);
            ``None`` crawls every page.
    """
    task = Queue()
    queue = Queue()
    result = Queue()
    post_data = {'kd': keyword, 'pn': 1, 'first': 'true'}
    opener = urllib.urlopen(interface, urllib.urlencode(post_data))
    try:
        jsonData = json.loads(opener.read())
    finally:
        opener.close()
    position_result = jsonData['content']['positionResult']
    # Number of pages = ceil(totalCount / resultSize); an empty result set
    # reports resultSize 0, which must not divide.
    totalCount = position_result['totalCount']
    resultSize = position_result['resultSize']
    if resultSize:
        pageNums = (totalCount + resultSize - 1) // resultSize
    else:
        pageNums = 0
    for r in position_result['result']:
        queue.put(r['positionId'])
    # Limit the crawl (3 pages by default, for debugging) without asking
    # for pages beyond the last one.
    if max_pages is not None:
        pageNums = min(pageNums, max_pages)
    # Page 1 was consumed above; hand the rest to the producer.
    for i in range(2, pageNums + 1):
        task.put(i)
    num_consumers = mp.cpu_count()
    processes = [mp.Process(target=getDetail, args=(queue, result))
                 for _ in range(num_consumers)]
    # Forward the keyword so later pages search for the same term as page 1.
    processes.append(
        mp.Process(target=getPageData, args=(task, queue, keyword)))
    for p in processes:
        p.start()
    with open('jobs', 'w+') as f:
        # Drain while the workers run: a child that has put data on a
        # multiprocessing queue cannot exit until that data is consumed,
        # so joining first can deadlock.
        while any(p.is_alive() for p in processes):
            _drain_results(result, f)
        for p in processes:
            p.join()
        print('processes over')
        # Pick up anything flushed between the last drain and the exits.
        _drain_results(result, f)
# Run the crawl only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    start()
2025年dockerdesktop运行错误(docker desktop is shutting down)
Docker Desktop 运行错误(Docker Desktop is shutting down)# -*- coding: UTF-8 -*- import multiprocessing as mp; from multiprocessing import Queue; import BeautifulSoup as bs4; from Queue import Empty; import urllib; import json。getPageData 类似于生产者,获取工作列表
大家好,我是讯享网,很高兴认识大家。
codeorg网站(code org网站)
上一篇
2025-05-16 08:55
2025年如何下载pymysql安装包(如何安装pymssql)
下一篇
2025-04-29 17:11

版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容,请联系我们,一经查实,本站将立刻删除。
如需转载请保留出处:https://51itzy.com/kjqy/156463.html