

#!/usr/bin/env python
# coding=utf-8
import os
import time
import threading
from multiprocessing import Pool, cpu_count

import requests
from bs4 import BeautifulSoup

headers = {
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/56.0.2924.87 Safari/537.36',
    'Referer': "http://www.mmjpg.com"
}
dir_path = r"E:\mmjpg"  # local folder where downloaded images are saved
def save_pic(pic_src, pic_cnt):
    """Download one image into the current working directory."""
    try:
        img = requests.get(pic_src, headers=headers, timeout=10)
        imgname = "pic_cnt_{}.jpg".format(pic_cnt + 1)
        # 'wb' overwrites any partial file left by an interrupted run instead of appending to it
        with open(imgname, 'wb') as f:
            f.write(img.content)
        print(imgname)
    except Exception as e:
        print(e)
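
# A hedged variant, not part of the original script: requests can stream the
# response body in chunks instead of buffering it whole in memory, which is
# gentler for larger files. save_pic_streaming is a hypothetical name.
def save_pic_streaming(pic_src, pic_cnt):
    """Download one image in chunks rather than as a single .content blob."""
    try:
        with requests.get(pic_src, headers=headers, timeout=10, stream=True) as img:
            imgname = "pic_cnt_{}.jpg".format(pic_cnt + 1)
            with open(imgname, 'wb') as f:
                for chunk in img.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(imgname)
    except Exception as e:
        print(e)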
def make_dir(folder_name):
    """Create the folder for one photo set and chdir into it."""
    path = os.path.join(dir_path, folder_name)
    # If the directory already exists, the set was crawled before, so skip it
    # (deduplication). Returns False when it exists, True otherwise.
    if not os.path.exists(path):
        os.makedirs(path)
        print(path)
        os.chdir(path)
        return True
    print("Folder already exists!")
    return False
def delete_empty_dir(directory):
    """If a run was interrupted, a folder may have been created before any image
    was downloaded. Because make_dir skips existing folders, that set would never
    be fetched again, so empty folders must be removed before re-running."""
    if os.path.exists(directory):
        if os.path.isdir(directory):
            for d in os.listdir(directory):
                path = os.path.join(directory, d)  # build the child path
                if os.path.isdir(path):
                    delete_empty_dir(path)  # recursively prune empty subfolders
        if not os.listdir(directory):
            os.rmdir(directory)
            print("remove the empty dir: {}".format(directory))
    else:
        print("Please start your performance!")  # the download root does not exist yet

lock = threading.Lock()  # global resource lock (but see the caveat in the sketch after this script)
def urls_crawler(url):
    """Crawler entry point: fetch one photo-set page and download all its images."""
    try:
        r = requests.get(url, headers=headers, timeout=10).text
        # Photo-set title, also used as the folder name; the server mislabels the
        # encoding, so re-encode from ISO-8859-1 and decode as UTF-8
        folder_name = BeautifulSoup(r, 'lxml').find('h2').text.encode('ISO-8859-1').decode('utf-8')
        with lock:
            if make_dir(folder_name):
                # number of images in the set, read from the pagination bar
                max_count = BeautifulSoup(r, 'lxml').find('div', class_='page').find_all('a')[-2].get_text()
                # one page per image
                page_urls = [url + "/" + str(i) for i in range(1, int(max_count) + 1)]
                # collect the image URLs
                img_urls = []
                for index, page_url in enumerate(page_urls):
                    result = requests.get(page_url, headers=headers, timeout=10).text
                    # the last page has a bare <img> with no <a> wrapper, so parse it separately
                    if index + 1 < len(page_urls):
                        img_url = BeautifulSoup(result, 'lxml').find('div', class_='content').find('a').img['src']
                    else:
                        img_url = BeautifulSoup(result, 'lxml').find('div', class_='content').find('img')['src']
                    img_urls.append(img_url)

                for cnt, img_url in enumerate(img_urls):
                    save_pic(img_url, cnt)
    except Exception as e:
        print(e)
if __name__ == "__main__":
    urls = ['http://mmjpg.com/mm/{cnt}'.format(cnt=cnt) for cnt in range(1, 953)]
    pool = Pool(processes=cpu_count())
    try:
        delete_empty_dir(dir_path)
        pool.map(urls_crawler, urls)
    except Exception:
        # on a transient failure (e.g. the site throttling us), wait and retry once
        time.sleep(30)
        delete_empty_dir(dir_path)
        pool.map(urls_crawler, urls)
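
# --- Sketch: a true cross-process lock (not part of the original script) ----
# threading.Lock() is duplicated into every Pool worker, so the check-then-create
# in make_dir is not actually serialized across processes. One assumed fix is a
# Manager-backed lock handed to each worker via the Pool initializer; init_lock
# is a hypothetical helper name.
from multiprocessing import Manager

def init_lock(shared_lock):
    global lock
    lock = shared_lock  # rebind the module-level lock inside each worker

# usage sketch:
#   manager = Manager()
#   pool = Pool(processes=cpu_count(), initializer=init_lock,
#               initargs=(manager.Lock(),))
#   pool.map(urls_crawler, urls)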

import urllib.request
import re
# 1. fetch the index page source
# 2. extract the chapter hyperlinks
# 3. fetch each chapter page source
# 4. extract the novel text
# 5. save it to disk (file operations)

# the function name below uses camelCase
# fetch the novel content
def getNovertContent():
    html = urllib.request.urlopen("http://www.quanshuwang.com/book/0/269").read()
    html = html.decode("gbk")
    # the parentheses form capture groups; without them findall returns no matches
    # in the regex, .*? lazily matches anything
    reg = r'<li><a href="(.*?)" title=".*?">(.*?)</a></li>'
    # precompiling the pattern makes repeated use more efficient
    reg = re.compile(reg)
    urls = re.findall(reg, html)
    # print(urls)
    # a list of (chapter URL, chapter title) tuples, e.g.:
    # [('http://www.quanshuwang.com/book/0/269/78850.html', '第一章 山邊小村'),
    #  ('http://www.quanshuwang.com/book/0/269/78854.html', '第二章 青牛鎮')]
    for url in urls:
        # chapter URL
        novel_url = url[0]
        # chapter title
        novel_title = url[1]

        chapt = urllib.request.urlopen(novel_url).read()
        chapt_html = chapt.decode("gbk")
        # r marks a raw string literal, so backslashes need no escaping: r"\d" instead of "\\d"
        # (reconstructed pattern: capture the chapter body between the surrounding script tags)
        reg = r'</script>&nbsp;&nbsp;&nbsp;&nbsp;(.*?)<script type="text/javascript">'
        # re.S lets . match newlines, since the chapter text spans multiple lines
        reg = re.compile(reg, re.S)
        chapt_content = re.findall(reg, chapt_html)
        # strip the leftover HTML whitespace entities and line-break tags
        chapt_content = chapt_content[0].replace("&nbsp;&nbsp;&nbsp;&nbsp;", "")
        chapt_content = chapt_content.replace("<br />", "")

        print("saving %s" % novel_title)
        with open(novel_title + ".txt", 'w') as f:
            f.write(chapt_content)

getNovertContent()
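
# --- Sketch: the same link extraction without regex (not in the original) ---
# Regex over HTML is brittle; the first script already uses BeautifulSoup, and
# the chapter list could be pulled with it instead. A minimal sketch assuming
# chapters are plain <li><a href title> entries, as the regex above implies;
# get_chapter_links is a hypothetical name, and a real page would likely need
# tighter scoping than "every anchor with href and title".
from bs4 import BeautifulSoup

def get_chapter_links(html):
    soup = BeautifulSoup(html, 'lxml')
    return [(a['href'], a.get_text())
            for a in soup.find_all('a', href=True, title=True)]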