[Python] Multithreaded Crawler Downloads

Site 1

TIPS:

1. The site hotlink-protects its images, so after obtaining the real image address you have to add a referer header to the request to get around the restriction, as in the sketch below.
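A minimal sketch of the idea (the image address below is a hypothetical placeholder; only the headers matter):

import requests

post_url = 'https://www.tuao8.xyz/post/1316.html'  # the post page that embeds the image
img_url = 'https://example.com/images/001.jpg'     # hypothetical: the real address scraped from the page

# Without the referer the image host typically answers 403;
# sending the post page as referer passes the hotlink check.
resp = requests.get(img_url, headers={
    'user-agent': 'Mozilla/5.0',
    'referer': post_url,
})
print(resp.status_code)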

 

[Result screenshot]

The Python code is as follows:


'''
@author=lthero
'''
import re
import time
from bs4 import BeautifulSoup
import requests
import threading
import random
import string

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/98.0.4758.102 Safari/537.36',
    'referer': '',  # filled in with the post URL before any image is requested
    'cookie': 'ftwwwtuao8xyz=1; HstCfa4220059=1644384603290; HstCmu4220059=1644384603290; '
              'c_ref_4220059=https%3A%2F%2Fwww.google.com%2F; timezone=8; HstCnv4220059=3; ftwwwtuao8xyz=1; '
              'HstCns4220059=8; HstCla4220059=1646142002976; HstPn4220059=13; HstPt4220059=169'}


class myThread(threading.Thread):
    """Each thread fetches one paginated sub-page of the post and saves its image."""

    def __init__(self, url, theNum, file_path):
        threading.Thread.__init__(self)
        self.url = url              # base URL ending in '?page='
        self.file_path = file_path  # folder to save into
        self.theNum = theNum        # the page number this thread handles

    def ranstr(self, num):
        # random num-character name so saved files don't overwrite each other
        salt = ''.join(random.sample(string.ascii_letters + string.digits, num))
        return salt

    def open_url(self, url):
        # headers must be passed as a keyword argument; passed positionally
        # it would be treated as the params query string instead
        response = requests.get(url, headers=headers)
        return response.text

    def run(self):
        # fetch page N of the post and pull the first full-size .jpg address out of it
        soup = BeautifulSoup(self.open_url(self.url + str(self.theNum)), 'lxml')
        pattern = re.compile(r'https://.*[1-9]\d*\.jpg')
        obj_images = soup.find_all(src=pattern)
        each_download_url = pattern.findall(str(obj_images[0]))[0]
        # this request carries the referer header set in picDown, so the hotlink check passes
        img = requests.get(each_download_url, headers=headers)
        with open('%s/%s.jpg' % (self.file_path, self.ranstr(6)), 'wb') as f:
            f.write(img.content)


# Site 1 spreads a post's images across paginated sub-pages (?page=N)
class picDown():
    def __init__(self, url, save_path):
        headers['referer'] = url  # the post URL itself satisfies the hotlink check
        self.page = '?page='
        self.url = url + self.page
        self.save_path = save_path
        # one thread per sub-page; up to 34 pages are tried
        for i in range(1, 35):
            thread1 = myThread(self.url, i, self.save_path)
            thread1.start()
            time.sleep(0.2)  # stagger the requests a little


if __name__ == '__main__':
    # Test URLs
    # https://www.tuao8.xyz/post/1316.html
    # https://www.tuao8.xyz/post/2254.html
    picDown(url=input("Enter the post URL: "),
            save_path=input("Enter the folder to save into: "))

 

A packaged (standalone) build of the script is also provided.

 


 

When running it, take the URL to enter from the "Test URLs" comments in the code, or browse the site directly in a browser; any other post on the same site can be downloaded the same way.

The destination folder must already exist, otherwise the script errors out.
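If you'd rather not depend on that, a one-line guard (standard library os; not part of the original script) can create the folder first:

import os

save_path = input("Enter the folder to save into: ")
os.makedirs(save_path, exist_ok=True)  # create the folder (and any parents) if missing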

 

Site 2

Unlike site 1, where the images are spread across paginated sub-pages, all of site 2's images are linked from a single URL.
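In bs4 that means filtering on href instead of src. A tiny sketch of the pattern, using made-up markup (the HTML string is hypothetical, standing in for the real page):

import re
from bs4 import BeautifulSoup

html = '''
<a href="https://example.com/p/001.jpg">1</a>
<a href="https://example.com/p/002.jpg">2</a>
<a href="/about.html">about</a>
'''

pattern = re.compile(r'https://.*\.jpg')
soup = BeautifulSoup(html, 'lxml')
for a in soup.find_all(href=pattern):  # keeps only tags whose href matches the regex
    print(a['href'])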

 

The code is as follows:

import re
import time
from bs4 import BeautifulSoup
import requests
import threading
import random
import string

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}


class myThread(threading.Thread):
    """Each thread downloads a single image URL."""

    def __init__(self, the_url, file_path):
        threading.Thread.__init__(self)
        self.url = the_url
        self.file_path = file_path

    def ranstr(self, num):
        # random num-character name so saved files don't overwrite each other
        salt = ''.join(random.sample(string.ascii_letters + string.digits, num))
        return salt

    def run(self):
        img = requests.get(self.url, headers=headers)
        with open('%s/%s.jpg' % (self.file_path, self.ranstr(6)), 'wb') as f:
            f.write(img.content)


class picDown():
    def __init__(self, url, save_path):
        self.url = url
        self.url_images = []
        self.save_path = save_path
        soup = BeautifulSoup(self.open_url(), 'lxml')

        # every image is exposed as an <a href="...jpg"> link on the one page
        pattern = re.compile(r'https://.*\.jpg')
        obj_images = soup.find_all(href=pattern)
        for i in obj_images:
            each_url = pattern.findall(str(i))
            self.url_images.append(each_url[0])
        self.download()

    def open_url(self):
        # headers must be passed as a keyword argument; passed positionally
        # it would be treated as the params query string instead
        response = requests.get(self.url, headers=headers)
        print(response.status_code)
        return response.text

    def download(self):
        # one thread per image URL
        for each_url in self.url_images:
            thread1 = myThread(each_url, self.save_path)
            thread1.start()
            time.sleep(0.1)  # stagger the requests a little


if __name__ == '__main__':
    # Test URL
    # http://wushengguang.club/index.php/archives/%5BYuzuki%5D+E+Cup+Natural+Beauty+Girl+-+%2866P%29.html
    picDown(url=input("Enter the post URL: "),
            save_path=input("Enter the folder to save into: "))
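One caveat that applies to both scripts: the threads are started but never joined, so the main program cannot tell when the last image has been saved (the process simply stays alive until the non-daemon threads finish on their own). If you want a completion message, a variant of picDown.download along these lines (a sketch, not the original code) collects and joins the threads:

    def download(self):
        threads = []
        for each_url in self.url_images:
            t = myThread(each_url, self.save_path)
            t.start()
            threads.append(t)
            time.sleep(0.1)
        for t in threads:
            t.join()  # block until this download thread is done
        print('All downloads finished')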

 
