2025年python3 27270网站美女爬虫(二)

python3 27270网站美女爬虫(二)对 27270 网站的美女图片进行了一次爬取尝试,使用 python3 库 urllib、BeautifulSoup、lxml,主要是下载静态网页的图片:首先从 IP 代理网站上获取代理池,然后再从静态页面获取图片的链接,最后下载图片。1 爬虫下载 IP 代理 2 模拟浏览器下载 分析页面

大家好,我是讯享网,很高兴认识大家。
python3 库: urllib BeautifulSoup lxml 

讯享网

主要是下载静态网页的图片

首先从IP代理网站上获取代理池,然后再从静态页面获取图片的链接,最后下载图片


讯享网

分析页面

讯享网<body> ....省略其他页面代码 <div> ....省略其他页面代码 <div class="MeinvTuPianBox"> <ul> ....省略其他页面代码 <li> <a href="*" title="" class="MMPic" target="_blank"><i><img src="*" width="190" height="280" alt="*" /></i></a> ....省略其他页面代码 </li> ....省略其他页面代码 </div>

从上面可以看出页面各个元素之间的关系,确定好要找元素的位置

body > div > div class=MeinvTuPianBox > ul > li > a class=MMPic > i > img

完整的代码

"""Image scraper for www.27270.com gallery list pages.

Builds a pool of HTTP proxies scraped from xicidaili.com, then walks the
static gallery list pages, extracts image URLs/titles from the page DOM
and downloads each image to a per-page folder on disk.
"""
from urllib.request import urlopen
import urllib.request
from bs4 import BeautifulSoup
import os, time
import http.cookiejar
import random
from urllib.request import urlretrieve, HTTPError, urlopen, URLError

base_url = 'http://www.27270.com/'  # ent/meinvtupian/list_11_%s.html
one_url = ['word']
base_dir = ''   # download root; set in __main__
proxy_ip = []   # proxy pool; filled by getProxyIp() in __main__


def getProxyIp():
    """Scrape the first two listing pages of xicidaili.com.

    Returns:
        list[str]: proxies in ``'http://ip:port'`` form.
    """
    proxy = []
    header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Ubuntu Chromium/44.0.2403.89 '
                      'Chrome/44.0.2403.89 '
                      'Safari/537.36'}
    for i in range(1, 3):
        req = urllib.request.Request(
            url='http://www.xicidaili.com/nt/{0}'.format(i), headers=header)
        r = urllib.request.urlopen(req)
        soup = BeautifulSoup(r, 'html.parser', from_encoding='utf-8')
        table = soup.find('table', attrs={'id': 'ip_list'})
        # The first <tr> is the column header row -- skip it.
        for item in table.find_all('tr')[1:]:
            tds = item.find_all('td')
            # tds[1] = IP address, tds[2] = port (per the site's table layout)
            kind = "{0}:{1}".format(tds[1].get_text().lower(),
                                    tds[2].get_text())
            proxy.append("http://" + kind)
    return proxy


def getIP():
    """Return a random proxy string from the module-level pool."""
    return random.choice(proxy_ip)


def makeMyOpener(head=None):
    """Build a cookie-aware URL opener with browser-like request headers.

    Args:
        head: optional mapping of header name -> value.  ``None`` (the
            default) selects the built-in browser headers.  (BUG FIX: the
            original used a mutable dict as the default argument.)

    Returns:
        urllib.request.OpenerDirector ready for ``.open(url)``.
    """
    if head is None:
        head = {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; '
                          'rv:11.0) like Gecko',
        }
    cj = http.cookiejar.CookieJar()
    handlers = [urllib.request.HTTPCookieProcessor(cj)]
    if proxy_ip:  # robustness: skip the proxy when the pool is empty
        proxy = getIP()
        print(proxy)
        # BUG FIX: the proxy used to be appended to ``addheaders`` as a
        # fake ('http', proxy) request header, which has no effect; it
        # must be installed through a ProxyHandler to be used.
        handlers.append(urllib.request.ProxyHandler({'http': proxy}))
    opener = urllib.request.build_opener(*handlers)
    opener.addheaders = list(head.items())
    return opener


def download(url, file_name, index):
    """Download ``url`` into ``base_dir/<index>/<file_name>`` (best effort).

    Failures are logged and swallowed so one broken image does not stop
    the crawl.  (Was a bare ``except: pass`` that hid every error.)
    """
    folder = base_dir + str(index) + '/'
    if not os.path.isdir(folder):
        os.makedirs(folder)
    path = folder + file_name
    try:
        with urlopen(url, timeout=30) as r:
            content = r.read()
        with open(path, 'wb') as fh:
            fh.write(content)
    except OSError as e:  # URLError/HTTPError are OSError subclasses
        print('download failed: {0} ({1})'.format(url, e))


def get_url_list(index, end):
    """Crawl gallery list pages starting at ``index`` and download images.

    Pages follow the CSS path
    ``body > div > div.MeinvTuPianBox > ul > li > a.MMPic > i > img``;
    each ``img`` carries the picture URL in ``src`` and its title in
    ``alt``.  Crawling stops at the first page with no matches.

    ``end`` is kept for interface compatibility; as in the original code
    it does not bound the crawl.

    BUG FIXES vs. the original: pagination and error retries used
    unbounded recursion (stack growth, and an infinite retry loop on a
    persistent HTTP error); the stop condition called ``sys.exit`` without
    importing ``sys``; ``print('URLError' + e)`` raised TypeError.
    """
    retries = 0
    while True:
        url = 'http://www.27270.com/ent/meinvtupian/list_11_%s.html' % index
        try:
            oper = makeMyOpener()
            html = oper.open(url)
            soup = BeautifulSoup(html, 'lxml')
            girl_list = soup.select(
                'body > div > div.MeinvTuPianBox > ul > li > a.MMPic > i > img')
            if not girl_list:
                print(u'已经全部抓取完毕')
                return
            for photo in girl_list:
                link = photo.get('src')
                name = photo.get('alt')
                download(link, name + '.jpg', index)
                print(link + name)
            index = index + 1
            retries = 0
        except HTTPError as e:
            print('HTTPError' + str(e.code))
            retries += 1
            if retries > 3:  # give up on this page after 3 failed attempts
                index, retries = index + 1, 0
        except URLError as e:
            print('URLError' + str(e.reason))
            retries += 1
            if retries > 3:
                index, retries = index + 1, 0


if __name__ == '__main__':
    proxy_ip = getProxyIp()
    base_dir = 'E:/cache-work/python3/images1/'
    if not os.path.isdir(base_dir):
        os.makedirs(base_dir)
    get_url_list(163, 100)
小讯
上一篇 2025-03-29 15:21
下一篇 2025-02-18 18:25

相关推荐

版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容,请联系我们,一经查实,本站将立刻删除。
如需转载请保留出处:https://51itzy.com/kjqy/48501.html