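# Threaded downloader: fetches every chapter linked from a kanunu8.com table-of-contents page and saves each one as a text file.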
import os
import re
from multiprocessing.dummy import Pool
from time import time
from urllib.parse import urljoin

import requests

# Table-of-contents page of the book; chapter links found on it are
# resolved relative to this URL, so the trailing slash matters.
start_url = 'https://www.kanunu8.com/book4/10365/'

def get_source(url, timeout=10):
    """Download *url* and return the page body decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely and stall a worker.

    Returns:
        The response body decoded with the ``gbk`` codec.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx HTTP status.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here with a clear HTTP error instead of letting an error page
    # reach the regex parsers further down the pipeline.
    response.raise_for_status()
    return response.content.decode('gbk')

def get_toc(html, base_url=None):
    """Extract the absolute URL of every chapter from a table-of-contents page.

    The chapter links are taken from the part of the page between the text
    '正文' and the next ``</tbody>``.

    Args:
        html: Source of the table-of-contents page.
        base_url: URL the chapter links are resolved against. Defaults to the
            module-level ``start_url``.

    Returns:
        A list of chapter URLs in page order (empty if the block has no links).

    Raises:
        ValueError: If the page contains no table-of-contents block.
    """
    if base_url is None:
        base_url = start_url

    toc_match = re.search('正文(.*?)</tbody>', html, re.S)
    if toc_match is None:
        raise ValueError('table-of-contents block not found in page')

    hrefs = re.findall('href="(.*?)"', toc_match.group(1), re.S)
    # urljoin gives the same result as concatenation for plain relative links
    # and also copes with absolute or root-relative hrefs.
    return [urljoin(base_url, href) for href in hrefs]

def get_article(html):
    """Pull the chapter title and chapter text out of a chapter page.

    Args:
        html: Source of a single chapter page.

    Returns:
        A ``(chapter_name, text_block)`` tuple. The text is the content of
        the first ``<p>...</p>`` element with ``<br />`` tags and spaces
        removed.

    Raises:
        ValueError: If the title or the text paragraph cannot be found.
    """
    title_match = re.search('size="4">(.*?)<', html, re.S)
    if title_match is None:
        raise ValueError('chapter title not found in page')

    body_match = re.search('<p>(.*?)</p>', html, re.S)
    if body_match is None:
        raise ValueError('chapter text not found in page')

    text_block = body_match.group(1).replace('<br />', '')
    # NOTE(review): this strips every plain space; the literal may originally
    # have been an HTML entity such as '&nbsp;' - confirm against the site.
    text_block = text_block.replace(' ', '')
    return title_match.group(1), text_block

def save(chapter, article, directory='罗密欧与朱丽叶'):
    """Write one chapter to ``<directory>/<chapter>.txt`` as UTF-8.

    Args:
        chapter: Chapter title, used as the file name (without extension).
        article: Chapter text to write.
        directory: Target folder; created if it does not exist.
    """
    # The title is scraped from a web page, so neutralise path separators and
    # other characters that are illegal in file names before building a path.
    unsafe_chars = '\\/:*?"<>|\r\n\t'
    safe_name = chapter.translate({ord(c): '_' for c in unsafe_chars}).strip()
    filename = (safe_name or 'untitled') + '.txt'

    os.makedirs(directory, exist_ok=True)
    with open(os.path.join(directory, filename), 'w', encoding='utf-8') as f:
        f.write(article)
    print('下载成功--->', filename)

def query_article(url):
    """Download one chapter page, parse it, and save it to disk.

    This is the unit of work handed to each pool worker.

    Args:
        url: Address of a single chapter page.
    """
    article_html = get_source(url)
    chapter_name, article_text = get_article(article_html)
    save(chapter_name, article_text)

if __name__ == '__main__':
    toc_html = get_source(start_url)
    chapter_urls = get_toc(toc_html)

    start = time()
    # multiprocessing.dummy.Pool is backed by threads, which suits this
    # network-bound work; the with-block releases the workers afterwards.
    with Pool(4) as pool:
        pool.map(query_article, chapter_urls)
    end = time()
    # Elapsed seconds for downloading all chapters.
    print(end - start)

# (End of listing.)