终身高级VIP会员
- 资源币
- 44
- 积分
- 234
- 贡献
- 0
- 在线时间
- 22 小时
- 注册时间
- 2020-4-10
- 最后登录
- 2023-2-6
|
本帖最后由 请我喝茶 于 2020-5-5 14:15 编辑
# -*- coding: utf-8 -*-
# Author: 桑葚ICE
# Email: 152516cc@gmail.com
# Blog: iicey.github.io
# JueJin: juejin.im/user/5c64dce8e51d45013c40742c
import re
import time
import requests
from scrapy import Selector
class Spider:
    """Collect Baidu-pan share links hidden behind "reply to view" forum posts.

    Authentication is either a raw browser cookie string passed as ``cookie``
    or a username/password pair used by :meth:`enter`. Results accumulate in
    ``bd_url_info`` as ``{share_url: extraction_code}``.
    """

    # Seconds to wait on any single HTTP request before giving up, so one
    # stalled connection cannot hang the whole crawl.
    timeout = 30
    # Fallback form token used when none can be scraped from the page.
    # NOTE(review): Discuz form hashes are normally session-specific, so this
    # legacy value is unlikely to work for other accounts.
    formhash = '78484d61'

    _HIDDEN_MARKER = "如果您要查看本帖隐藏内容请"
    # Printable ASCII only, so trailing CJK text glued to the link is excluded.
    _URL_RE = re.compile(r"https://[\x21-\x7e]+")
    # Anchor on the label: a bare ``\w{4}`` also matches CJK text and "http".
    _PWD_RE = re.compile(r"提取码[^A-Za-z0-9]*([A-Za-z0-9]{4})")
    _FORMHASH_RE = re.compile(r'name="formhash"\s+value="(\w+)"')

    def __init__(self, un="", pwd="", cookie=""):
        """Store credentials and initialise empty session state.

        :param un: forum username (used by :meth:`enter`).
        :param pwd: forum password (used by :meth:`enter`).
        :param cookie: raw ``Cookie`` header value copied from a browser.
        """
        self.un = un
        self.pwd = pwd
        self.cookies = {}
        # Only send a Cookie header when one was supplied; an empty header
        # would otherwise take the place of the cookies stored by enter().
        self.headers = {"cookie": cookie} if cookie else {}
        self.bd_url_info = {}

    def _request(self, method, url, **kwargs):
        """Send an HTTP request carrying the shared headers, cookies and timeout."""
        return requests.request(
            method,
            url,
            headers=self.headers,
            cookies=self.cookies,
            timeout=self.timeout,
            **kwargs
        )

    @classmethod
    def _extract_formhash(cls, html):
        """Return the form token embedded in ``html``, or ``None`` if absent."""
        found = cls._FORMHASH_RE.search(html or "")
        return found.group(1) if found else None

    @classmethod
    def _parse_share_info(cls, texts):
        """Return ``(share_url, extraction_code)`` found in ``texts``.

        :param texts: iterable of text fragments from the unlocked post.
        :return: a 2-tuple, or ``None`` when either part is missing.
        """
        share_url = None
        code = None
        for fragment in texts:
            if share_url is None:
                url_match = cls._URL_RE.search(fragment)
                if url_match:
                    share_url = url_match.group(0)
            if code is None:
                code_match = cls._PWD_RE.search(fragment)
                if code_match:
                    code = code_match.group(1)
        if share_url is None or code is None:
            return None
        return share_url, code

    def enter(self):
        """Log in with ``un``/``pwd`` and keep the cookies from the response."""
        params = {
            "mod": "logging",
            "action": "login",
            "loginsubmit": "yes",
            "infloat": "yes",
            "lssubmit": "yes",
            "inajax": "1"
        }
        data = {
            "fastloginfield": "username",
            "username": self.un,
            "password": self.pwd,
            "quickforward": "yes",
            "handlekey": "ls",
        }
        response = self._request(
            "post",
            'https://www.zygx8.com/member.php',
            params=params,
            data=data
        )
        self.cookies = requests.utils.dict_from_cookiejar(response.cookies)

    def fid_to_tid(self, fid, page=1, tid_s=None):
        """Collect the thread ids listed in a board, walking its pages.

        Pages are fetched one after another until a page contributes no id
        that is not already known.

        :param fid: board (forum) id.
        :param page: page number to start from.
        :param tid_s: optional set to extend; a new set is created when ``None``.
        :return: set of thread ids as strings.
        """
        if tid_s is None:
            tid_s = set()
        # Iterative on purpose: one stack frame per page could exhaust the
        # recursion limit on large boards.
        while True:
            params = (
                ('mod', 'forumdisplay'),
                ('fid', fid),
                ('page', page),
                ('t', '5104641'),
            )
            response = self._request(
                "get",
                'https://www.zygx8.com/forum.php',
                params=params
            )
            listing = Selector(text=response.text)
            row_ids = listing.xpath(
                '//*[@id="threadlisttableid"]/tbody[contains(@id,"normalthread")]/@id'
            ).extract()
            known = len(tid_s)
            tid_s.update(row.replace("normalthread_", "") for row in row_ids)
            if len(tid_s) == known:
                return tid_s
            page += 1

    def get_content(self, fid, tid, max_replies=3, wait=60):
        """Fetch a thread and record its share link and extraction code.

        When the post is still locked, a reply is posted, the method waits
        ``wait`` seconds and fetches again, at most ``max_replies`` times.

        :param fid: board id the thread belongs to.
        :param tid: thread id.
        :param max_replies: upper bound on unlock attempts.
        :param wait: seconds to sleep after each reply.
        """
        thread_url = (
            f"https://www.zygx8.com/forum.php?mod=viewthread&tid={tid}&extra=page%3D1"
        )
        response = self._request("get", thread_url)
        replies = 0
        while self._HIDDEN_MARKER in response.text:
            if replies >= max_replies:
                # Give up instead of replying forever (e.g. not logged in).
                print("hidden content still locked", response.url)
                return
            self.post_content(
                fid, tid, formhash=self._extract_formhash(response.text)
            )
            replies += 1
            time.sleep(wait)
            response = self._request("get", thread_url)

        page = Selector(text=response.text)
        url_info = page.xpath('//div[@class="showhide"]//text()').extract()
        share = self._parse_share_info(url_info)
        if share is None:
            print("share link or extraction code not found", response.url)
            return
        bd_url, bd_pwd = share
        print(bd_url, bd_pwd)
        self.bd_url_info[bd_url] = bd_pwd

    def post_content(self, fid, tid, formhash=None):
        """Post a short reply to a thread so that its hidden content unlocks.

        :param fid: board id the thread belongs to.
        :param tid: thread id to reply to.
        :param formhash: form token for the current session; falls back to
            the class-level default when omitted.
        """
        params = (
            ('mod', 'post'),
            ('infloat', 'yes'),
            ('action', 'reply'),
            ('fid', fid),
            ('extra', ''),
            ('tid', tid),
            ('replysubmit', 'yes'),
            ('inajax', '1'),
        )
        data = {
            'formhash': formhash or self.formhash,
            'handlekey': 'reply',
            'noticeauthor': '',
            'noticetrimstr': '',
            'noticeauthormsg': '',
            'usesig': '0',
            'subject': '',
            'message': '666'
        }
        self._request(
            "post",
            'https://www.zygx8.com/forum.php',
            params=params,
            data=data
        )

    def main(self):
        """Placeholder entry point; currently does nothing."""
        pass
if __name__ == '__main__':
    # Paste the Cookie header copied from the browser dev tools (F12) below.
    cookie = "浏览器F12打开取出cookie放到这里"
    spider = Spider(cookie=cookie)
    board_ids = [158]
    for fid in board_ids:
        thread_ids = spider.fid_to_tid(fid)
        for tid in thread_ids:
            spider.get_content(fid, tid)
    # Dump every {share_url: extraction_code} pair collected during the run.
    print(spider.bd_url_info)
|
|