GlidedSky | 爬虫题目1
-
心得
这道题目所需的技能点都很基础,如发送请求、文档解析,很多工具都可以达到目的,重要的是寻找最简短的代码、最优雅的解析方法来提取文本,在这里我用的是 PyQuery 的 CSS 选择器方法。十分和谐 🐱
# -*- coding: utf-8 -*-
"""
# Talk is cheap, show me the code!
@Author billie
@Time 2021/05/05
@Describe
"""
import functools
import logging

import requests
from pyquery import PyQuery as pq
# Layout of every log line: timestamp, level name, then the message.
# These are logging %-style templates, not f-strings, so no ``f`` prefix is used.
LOG_FORMAT = "%(asctime)s %(levelname)s: %(message)s"
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

# Configure the root logger once, at import time, at INFO level.
logging.basicConfig(
    # Uncomment to send log output to a file instead of the default stream.
    # filename='glidedsky.log',
    format=LOG_FORMAT,
    datefmt=LOG_DATE_FORMAT,
    level=logging.INFO,
)
def log(func):
    """Decorate *func* so that exceptions are logged instead of propagated.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        *func* and returns its result. If *func* raises an ``Exception``,
        ``repr()`` of that exception is logged at ERROR level via the
        ``logging`` module and the wrapper returns ``None``.
    """

    # functools.wraps keeps the wrapped function's __name__/__doc__ intact,
    # which the bare closure would otherwise replace with "wrapper".
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Deliberate best-effort: callers test the return value for
            # truthiness, so a failure is reported as None, not re-raised.
            logging.error(repr(e))
            return None

    return wrapper
class spider:
    """Log in to glidedsky.com and sum the numbers on the crawler-basic-1 page.

    All requests go through one ``requests`` session so that the cookies set
    while logging in are sent with the later page request.
    """

    def __init__(self, email='2380540710@qq.com', password='a1222222222', timeout=10):
        """Set up the URLs, HTTP session, request headers and credentials.

        Args:
            email: Account e-mail posted to the login form.
            password: Account password posted to the login form.
            timeout: Seconds to wait on each HTTP request before giving up.
                Without a timeout ``requests`` may block indefinitely.

        NOTE(review): the default credentials are kept only so that the
        existing no-argument ``spider()`` call keeps working; prefer passing
        them in explicitly rather than keeping secrets in source.
        """
        self.login_url = "http://glidedsky.com/login"
        self.url = 'http://glidedsky.com/level/web/crawler-basic-1'
        self.session = requests.Session()
        self.email = email
        self.password = password
        self.timeout = timeout
        # Only a browser-like User-Agent is sent; cookies are managed by the session.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
        }

    def getToken(self):
        """Fetch the login page and return its hidden ``_token`` form value.

        Returns:
            The ``value`` attribute of the ``input[name=_token]`` element,
            or ``None`` when the page contains no such element.
        """
        r = self.session.get(self.login_url, headers=self.headers, timeout=self.timeout)
        doc = pq(r.text)
        return doc('input[name=_token]').attr('value')

    @log
    def login(self):
        """Post the login form using a freshly fetched ``_token``.

        Returns:
            ``True`` when the login request ends with HTTP status 200.
            Because of the ``@log`` decorator, any failure (missing token,
            network error, non-200 status) is logged and yields ``None``.

        NOTE(review): a 200 status is the only success check performed here;
        whether the site also answers 200 for rejected credentials is not
        visible from this code - confirm.
        """
        token = self.getToken()
        if not token:
            # Posting without a token cannot succeed; fail with a clear reason.
            raise Exception('登录失败,登录页面中未找到_token')
        r = self.session.post(
            url=self.login_url,
            headers=self.headers,
            data={
                '_token': token,
                'email': self.email,
                'password': self.password,
            },
            timeout=self.timeout,
        )
        if r.status_code != 200:
            raise Exception(f'登录失败,状态码:{r.status_code}')
        return True

    def parse(self):
        """Log in, fetch the challenge page and sum its numbers.

        The text of every ``.col-md-1`` element is split on whitespace, each
        piece is converted with ``int`` and the total is logged at INFO level.

        Returns:
            The integer sum, or ``None`` when login did not succeed.
        """
        if not self.login():
            return None
        logging.info('登录成功')
        r = self.session.get(self.url, headers=self.headers, timeout=self.timeout)
        doc = pq(r.text)
        count = sum(int(num) for num in doc('.col-md-1').text().split())
        logging.info(count)
        return count
# Script entry: build the crawler and run the login + parse flow once.
# NOTE(review): these statements also execute when the module is imported;
# consider an `if __name__ == "__main__":` guard if it is ever imported.
bb = spider()
bb.parse()