import requests
import re
import json
from lxml import etree
import os
import threading
from queue import Queue

'''
Time: 2020-06-16 18:40:20
By: Eg.
'''
'''
Crawls the images posted in the comments under a Weibo status, e.g.:
https://m.weibo.cn/status/4489758501815678?
https://weibo.com/6524978930/IF9XHxJpF?type=repost#_rnd1588381011696
'''
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
cookies = {}

# silence the certificate warning triggered by verify=False below
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)


def visitorlogin():
    '''
    Perform Weibo's three-step visitor (guest) login and return the cookies.
    '''
    # ------- visitor login, step 1: fetch a tid
    url = 'https://passport.weibo.com/visitor/genvisitor'
    data = {'cb': 'gen_callback',
            'fp': json.dumps({"os": "1",
                              "browser": "Chrome81,0,4044,129",
                              "fonts": "undefined",
                              "screenInfo": "1280*800*24",
                              "plugins": "Portable Document Format::internal-pdf-viewer::Chrome PDF Plugin|::mhjfbmdgcfjbbpaeojofohoefgiehjai::Chrome PDF Viewer|::internal-nacl-plugin::Native Client"})}
    r = requests.post(url, data=data, headers=headers, verify=False)
    text = re.findall(r'gen_callback\((.*?)\);', r.text)[0]
    tid = json.loads(text)['data']['tid']

    # ------- visitor login, step 2: exchange the tid for sub/subp tokens
    params = {'a': 'incarnate',
              't': tid,
              'cb': 'cross_domain'}
    url = 'https://passport.weibo.com/visitor/visitor'
    r = requests.get(url, params=params, headers=headers, verify=False)
    text = re.findall(r'cross_domain\((.*?)\);', r.text)[0]
    sub = json.loads(text)['data']['sub']
    subp = json.loads(text)['data']['subp']

    # ------- visitor login, step 3: cross-domain call that yields the cookies
    url = 'https://login.sina.com.cn/visitor/visitor'
    params = {'a': 'crossdomain',
              's': sub,
              'sp': subp,
              'cb': 'return_back'}
    r = requests.get(url, params=params, headers=headers, verify=False)
    return r.cookies.get_dict()


def base62_decode(string):
    '''
    Decode a base62 string into an integer (found on Baidu).
    '''
    alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    base = len(alphabet)
    strlen = len(string)
    num = 0
    idx = 0
    for char in string:
        power = (strlen - (idx + 1))
        num += alphabet.index(char) * (base ** power)
        idx += 1
    return num


def mid_to_url(midint):
    '''
    Convert a base62 mid (e.g. 'IF9XHxJpF') into the numeric status id
    (found on Baidu): split the mid into 4-character groups from the right,
    base62-decode each group, zero-pad it to 7 digits, then concatenate.
    '''
    url = str(midint)[::-1]
    size = len(url) // 4 if len(url) % 4 == 0 else len(url) // 4 + 1
    result = []
    for i in range(size):
        s = url[i * 4: (i + 1) * 4][::-1]
        s = str(base62_decode(s))
        # every group except the most significant one is padded to 7 digits
        if i < size - 1 and len(s) < 7:
            s = s.zfill(7)
        result.append(s)
    result.reverse()
    return int(''.join(result))
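
# A quick sanity check for the conversion above, added here as a sketch (not
# part of the original script): the expected value was worked out by hand
# from the same alphabet and grouping rule, and it matches the 16-digit shape
# of the numeric status ids used elsewhere in this file.
assert mid_to_url('IF9XHxJpF') == 4498097458039395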
class Th(threading.Thread):
    '''
    Download worker: pulls (weiboid, comment_id, img_url) tasks off the queue.
    '''
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.__queue = queue

    def run(self):
        while True:
            task = self.__queue.get()
            saveimg(task[0], task[1], task[2])
            self.__queue.task_done()


queue = Queue(20)  # number of download threads (also the queue size)
for i in range(queue.maxsize):
    t = Th(queue)
    t.daemon = True
    t.start()


def saveimg(weiboid, comment_id, img_url):
    #print(img_url)
    filename = '{}/{}{}'.format(weiboid, comment_id, os.path.basename(img_url))
    if not os.path.exists(filename):
        # retry until the image downloads successfully, and only create the
        # file once it has
        while True:
            try:
                r_img = requests.get(img_url, headers=headers, cookies=cookies, timeout=5)
            except requests.RequestException:
                print('error downloading pic', img_url)
            else:
                if r_img.status_code == 200:
                    break
        with open(filename, 'wb') as f:
            f.write(r_img.content)


def getweiboid(string):
    '''
    https://m.weibo.cn/status/4489758501815678?  ->  4489758501815678 directly
    https://m.weibo.cn/status/J5PjY66K0?
    https://weibo.com/6524978930/IF9XHxJpF?type=repost#_rnd1588381011696
      -> the mid ('J5PjY66K0' / 'IF9XHxJpF') has to be converted first
    '''
    if 'm.weibo' in string:
        # numeric status ids all appear to be 16 digits;
        # anything else is a base62 mid such as 'J5PjY66K0'
        match = re.findall(r'/(\d{16})', string)
        if match:
            return match[0]
        mid = re.findall(r'/([A-Za-z0-9]+)\?', string)[0]
    else:
        mid = re.findall(r'/\d+/([A-Za-z0-9]+)', string)[0]
    return str(mid_to_url(mid))


def spider(weibourl):
    global cookies
    weiboid = getweiboid(weibourl)
    # comment api, sorted by time
    url = 'https://weibo.com/aj/v6/comment/big?ajwvr=6&id={}&filter=all&from=singleWeiBo'.format(weiboid)
    if not os.path.exists(weiboid):
        os.makedirs(weiboid)
    # fetch every comment page in turn
    while True:
        print(url)
        tt = True
        retry = 0  # retry counter
        while tt:
            r = requests.get(url, headers=headers, cookies=cookies)
            html = etree.HTML(json.loads(r.text)['data']['html'])
            # one div per comment
            for div in html.xpath('/html/body/div/div/div[@class="list_li S_line1 clearfix"]'):
                comment_id = div.xpath('./@comment_id')[0]
                # images are usually embedded like this
                for imgurl in div.xpath('./div/div/div/ul/li/img/@src'):
                    # swap the thumb180 segment for large to get the full-size image
                    img_url = 'https:' + imgurl.replace('thumb180', 'large')
                    print('piclink:', img_url)
                    #saveimg(weiboid, comment_id, img_url)
                    queue.put([weiboid, comment_id, img_url])
            # sometimes weibo renders images as bare 网页链接 anchors instead,
            # which means we are requesting too fast: grab fresh visitor
            # cookies and retry the page
            if len(html.xpath('/html/body/div/div/div[@class="list_li S_line1 clearfix"]/div/div/a[@title="网页链接"]/@href')) == 0:
                tt = False
            else:
                cookies = visitorlogin()
                retry = retry + 1
                print('retry', retry)
        # build the url of the next page
        baseurl = 'https://weibo.com/aj/v6/comment/big?{}&from=singleWeiBo'
        # '&from=singleWeiBo' must stay, otherwise all sorts of problems appear
        try:
            url = baseurl.format(html.xpath('/html/body/div/div/div[@node-type="comment_loading"]/@action-data')[0])
        except IndexError:
            try:
                url = baseurl.format(html.xpath('/html/body/div/div/div/div/a[@class="page next S_txt1 S_line1"]/span/@action-data')[0])
            except IndexError:
                try:
                    url = baseurl.format(html.xpath('/html/body/div/div/a[@action-type="click_more_comment"]/@action-data')[0])
                except IndexError:
                    ## with open('test.html', 'w', encoding='utf-8') as f:
                    ##     f.write(json.loads(r.text)['data']['html'])
                    break  # barring surprises, every page has been fetched
        cookies = visitorlogin()


if __name__ == '__main__':
    cookies = visitorlogin()
    print('cookies:', cookies)
    spider('https://m.weibo.cn/detail/4511863448505629?')
    # spider('https://m.weibo.cn/status/4489758501815678?')
    queue.join()  # wait for the downloads to finish
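
# Usage sketch (untested here): both URL shapes quoted in the header comment
# should work as the argument to spider(), e.g.
#   spider('https://m.weibo.cn/status/4489758501815678?')
#   spider('https://weibo.com/6524978930/IF9XHxJpF?type=repost#_rnd1588381011696')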