# GlidedSky | Crawler Challenge 2

# -*- coding: utf-8 -*-
"""
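# Talk is cheap, show me the code!

@Author billie
@Time 2021/05/07
@Describe Log in to glidedsky.com and sum the numbers shown on every page of
          the "crawler-basic-2" challenge.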

"""
import functools
import logging
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import requests
from pyquery import PyQuery as pq

# Configure the root logger once, at import time: INFO level and above, each
# record prefixed with a timestamp, its level and the line number of the call.
logging.basicConfig(
    # Uncomment to send the records to a file:
    # filename='glidedsky.log',
    # The %(...)s placeholders are filled in by ``logging`` itself, so this is
    # a plain string; an f-prefix would add nothing and only mislead.
    format="%(asctime)s %(levelname)s[%(lineno)s]: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

def log(func):
    """Decorate *func* so that exceptions are logged instead of propagated.

    The returned wrapper forwards all positional and keyword arguments to
    *func* and returns its result. If *func* raises any ``Exception``, the
    ``repr`` of that exception is logged at ERROR level and the wrapper
    returns ``None``; callers must therefore treat ``None`` as "failed".

    ``functools.wraps`` keeps the wrapped function's ``__name__``,
    ``__doc__`` and other metadata on the wrapper.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            logging.error(repr(e))
            # Explicit: a failed call yields None rather than raising.
            return None
    return wrapper

class spider:
    """Crawler for the GlidedSky "crawler-basic-2" challenge.

    Logs in through a shared ``requests`` session, then downloads the
    paginated challenge pages and adds up every number found in the
    ``.col-md-1`` elements of each page.
    """

    def __init__(self):
        self.login_url = "http://glidedsky.com/login"
        self.url = 'http://glidedsky.com/level/web/crawler-basic-2'
        # A single session is used for every request so that the cookies set
        # during login are sent with the page requests as well.
        self.session = requests.session()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
        }
        self.page = 1   # next page number to fetch
        self.count = 0  # running total of all numbers seen so far

    def getToken(self):
        """Fetch the login page and return the value of its ``_token`` input.

        Returns ``None`` when the page contains no ``input[name=_token]``.
        """
        r = self.session.get(self.login_url, headers=self.headers)
        doc = pq(r.text)
        token = doc('input[name=_token]').attr('value')
        return token

    @log
    def login(self):
        """Post the login form; return ``True`` on an HTTP 200 response.

        Any other status raises an ``Exception``, which the ``@log``
        decorator logs and converts into a ``None`` return value, so callers
        only need a truthiness check.
        """
        # NOTE(review): credentials are hard-coded in source; they should be
        # moved to configuration or environment variables.
        # NOTE(review): a 200 status only shows that the final response
        # loaded; it presumably does not prove the credentials were
        # accepted -- confirm against the site's behaviour.
        r = self.session.post(
            url=self.login_url,
            headers=self.headers,
            data={
                '_token': self.getToken(),
                'email': '2380540710@qq.com',
                'password': 'a1222222222'
            })
        if r.status_code == 200:
            return True
        else:
            raise Exception(f'登录失败,状态码:{r.status_code}')

    def _page_sum(self, page):
        """Download page number *page* and return the sum of its numbers.

        Touches no crawler state besides the shared session, so it can be
        called for different pages from several threads at once.
        """
        r = self.session.get(self.url, params={'page': page}, headers=self.headers)
        doc = pq(r.text)
        return sum(int(num) for num in doc('.col-md-1').text().split())

    def parse(self):
        """Fetch page ``self.page``, add its numbers to ``self.count``, log
        the running total and advance ``self.page`` by one.

        This is the sequential, single-page step. It updates shared counters
        without locking and must not be run from several threads at once.
        """
        self.count += self._page_sum(self.page)
        logging.info(f'{self.page}:{self.count}')
        self.page += 1

    def main(self, last_page=1000, max_workers=8):
        """Log in and total the numbers on pages ``self.page``..*last_page*.

        Args:
            last_page: Number of the last page to fetch (inclusive).
            max_workers: Upper bound on concurrent download threads.

        Nothing is fetched when login fails. Each page is downloaded exactly
        once by a bounded thread pool; the totals are accumulated in page
        order on the calling thread, so ``self.count`` and ``self.page`` are
        never updated concurrently. If a download raises, the exception
        propagates from here, with ``self.page`` pointing at the failed page
        so a later call resumes from it.
        """
        if self.login():
            logging.info('登录成功')
            pages = range(self.page, last_page + 1)
            # NOTE(review): the worker threads share ``self.session``, as the
            # previous thread-per-page code did -- confirm this is acceptable
            # for the ``requests`` version in use.
            with ThreadPoolExecutor(max_workers=max_workers) as pool:
                # ``map`` yields results in submission order, keeping the
                # logged running total deterministic.
                for page, subtotal in zip(pages, pool.map(self._page_sum, pages)):
                    self.count += subtotal
                    logging.info(f'{page}:{self.count}')
                    self.page = page + 1


if __name__ == "__main__":
    # Start the crawl only when this file is executed as a script, so that
    # importing the module does not trigger a network login and download.
    bb = spider()
    bb.main()


# (blog page footer: 376 characters)