-
Notifications
You must be signed in to change notification settings - Fork 35
/
Setting.py
96 lines (88 loc) · 2.74 KB
/
Setting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# -*- coding: utf-8 -*-
"""Xiaohongshu (Little Red Book) scraper: fetches note listings as JSON,
parses individual note pages with BeautifulSoup, and stores the extracted
titles, texts, image URLs and video URLs in a local MongoDB database
named ``Redbook``."""
import time
import json
import pymongo
import urllib3
import requests
import Setting  # NOTE(review): this file appears to BE Setting.py — self-import; verify intent
from urllib import request
from bs4 import BeautifulSoup
from pymongo import MongoClient
# Shared MongoDB connection used by the save logic in getData() below.
client=MongoClient('localhost',27017)
db=client.Redbook
# All requests below use verify=False; silence the resulting TLS warnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# API address
def header():
    """Build the HTTP headers used for JSON API requests to xiaohongshu.com.

    Returns:
        dict: header mapping. The device fingerprint, user agent and
        authorization token are placeholders and must be filled in with
        real values before the API accepts the requests.
    """
    headers = {
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-cn',
        'Connection': 'keep-alive',
        # FIXME: the original source had empty values here (a dict-literal
        # syntax error, presumably redacted credentials). Fill these in.
        'Device-Fingerprint': '',
        # 'Cookie': '',
        'Host': 'www.xiaohongshu.com',
        'Referer': 'https://servicewechat.com/wxffc08ac7df482a27/346/page-frame.html',
        'User-Agent': '',
        'Authorization': '',
    }
    return headers
def html_header():
    """Build the HTTP headers used for HTML page requests to xiaohongshu.com.

    Returns:
        dict: header mapping. Fingerprint, cookie, user agent and X-Sign
        are placeholders and must be filled in before the site serves
        real content.
    """
    headers = {
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-cn',
        'Connection': 'keep-alive',
        # FIXME: the original source had empty values here (a dict-literal
        # syntax error, presumably redacted credentials). Fill these in.
        'Device-Fingerprint': '',
        'Cookie': '',
        'Host': 'www.xiaohongshu.com',
        'User-Agent': '',
        'X-Sign': '',
    }
    return headers
# 解析Json数据
# Fetch and decode JSON data
def getJsonSession(url):
    """GET *url* with the API headers and decode the response body as JSON.

    Args:
        url: fully-qualified API endpoint.

    Returns:
        The decoded JSON value (typically a dict).
    """
    # Context manager closes the session's connection pool; the original
    # created a session and never closed it (resource leak).
    with requests.Session() as ses:
        resp = ses.get(url, headers=header(), verify=False)
    # verify=False matches the original; TLS warnings are silenced at
    # module import time.
    return json.loads(resp.text)
# 解析HTML页面
# Fetch and parse an HTML page
def getHtmlSession(url):
    """GET *url* with the HTML headers and parse the body with BeautifulSoup.

    Args:
        url: note page URL.

    Returns:
        bs4.BeautifulSoup: parsed document tree.
    """
    # Context manager closes the session's connection pool; the original
    # created a session and never closed it (resource leak).
    with requests.Session() as ses:
        resp = ses.get(url, headers=html_header(), verify=False)
    return BeautifulSoup(resp.content, 'html.parser')
# 获取笔记类型和id
# Get note type and id
def getBookId(url):
    """Split the note listing at *url* into normal and video note URLs.

    Args:
        url: listing API endpoint whose JSON contains data.notes, where
            each note has a 'type' ('normal' or otherwise) and an 'id'.

    Returns:
        tuple: (normalBook, videoBook) — two lists of full note-page URLs.
    """
    normalBook = []
    videoBook = []
    # The original bound this to a local named 'json', shadowing the
    # imported json module — renamed to avoid that.
    payload = getJsonSession(url)
    base = 'https://www.xiaohongshu.com/discovery/item/'
    for note in payload['data']['notes']:
        if note['type'] == 'normal':
            normalBook.append(base + note['id'])
        else:
            videoBook.append(base + note['id'])
    return normalBook, videoBook
# 获取标题、文本、图片、视频链接 保存至MongoDB
# Get title, text, image and video links; save them to MongoDB
def getData(bookUrl):
    """Scrape each note page and persist the extracted data to MongoDB.

    Args:
        bookUrl: pair of URL lists as returned by getBookId():
            bookUrl[0] — normal (image) notes, bookUrl[1] — video notes.

    Side effects:
        Inserts one document per normal note into db.normalDatas and one
        document per video note into db.videoDatas.
    """
    for url in bookUrl[0]:
        time.sleep(5)  # throttle requests to avoid rate limiting
        soup = getHtmlSession(url)
        title = soup.find('div', class_ = 'note-top').find('h1', class_ = 'title').get_text()
        text = soup.find('div', class_ = 'content').get_text()
        # BUG FIX: the original created picUrls once before the loop, so
        # every note's document also contained all previous notes' image
        # URLs. Reset the list for each note instead.
        picUrls = []
        for pic in soup.find('div', class_ = 'small-pic').find_all('div'):
            # Slice the image URL out of the CSS background-image style
            # attribute (fixed prefix/suffix lengths).
            picUrl = pic.find('i', class_ = 'img').get('style')[21:-32]
            picUrls.append(picUrl)
        # One document per note: insert_one is the clearer equivalent of
        # insert_many with a single-element list.
        db.normalDatas.insert_one({
            'titleData' : title,
            'textData' : text,
            'picData' : picUrls
        })
    for url in bookUrl[1]:
        soup = getHtmlSession(url)
        videoSrc = soup.find('div', class_ = 'videoframe').find('video').get('src')
        # Save to MongoDB
        db.videoDatas.insert_one({
            'videoUrls': videoSrc
        })