[Practice] Scraping novels from 新笔趣阁 (xxbiquge.com)
Originally published 2017-04-15 13:36:36 · Author: CM無可厚非
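The script below is a Python 2 crawler for www.xxbiquge.com. Given a novel's ID (the `No` variable), it downloads the index page, the cover image, and every chapter; strips the site's injected ad strings; pickles its progress after each chapter so an interrupted run can resume where it left off; and finally assembles everything into a single .txt file.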
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Scraper for 新笔趣阁 (xxbiquge.com)
' xxbiquge'
__author__ = 'Wang930607'
import pprint,csv,urllib,urllib2,re,datetime
import traceback,time,json
#=========================
import socket
# Unify the default string encoding (Python 2)
import sys
reload(sys)
sys.setdefaultencoding('utf8')
# Get the system's default filesystem encoding
sysCharType = sys.getfilesystemencoding()
# sys.setdefaultencoding( "utf-8" )
#=========================
# Import the bs4 library
from bs4 import BeautifulSoup
#=========================
# Emit all permutations and combinations of a given collection
# from itertools import product
#=========================
# Read .xls files
import xlrd
# Write .xls files
import xlwt
#=========================
from datetime import date,datetime
#=========================
# Filesystem helpers (mkdir/chdir)
import os
#=========================
# Persistent data storage
try:
    import cPickle as pickle
except ImportError:
    import pickle
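# (cPickle is the C implementation of pickle and much faster; Python 2
# ships both, so fall back to the pure-Python module if it is missing.)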
#=========================
# Helper functions: start
#=========================
# Export the assembled novel to a text file
def save_file(file, data, hrefs):
    # Build the file name
    file_name = file + '.txt'
    # Open the output file
    fp = open(file_name, 'w')
    try:
        # Write the synopsis
        fp.writelines(data['intro'])
        for href in hrefs:
            # Only chapters that downloaded successfully have a body
            if data[href][0]:
                fp.writelines('\n' + '=================================================================================' + '\n')
                # Write the chapter title
                fp.writelines(data[href][1])
                fp.writelines('\n')
                # Write the chapter body
                fp.writelines(data[href][2])
    finally:
        # Close the file
        fp.close()
# Courtesy: throttle the crawler between requests
def quality():
    # Pause between requests
    time.sleep(0.5)
    # Cap how long a single download may take
    socket.setdefaulttimeout(15)
# Persist data to disk
def dump_pkl(file, data):
    # print data
    file_name = file + '.pkl'
    output = open(file_name, 'wb')
    pickle.dump(data, output)
    output.close()
# Load persisted data
def load_pkl(file):
    file_name = file + '.pkl'
    pkl_file = open(file_name, 'rb')
    data = pickle.load(pkl_file)
    pkl_file.close()
    return data
# Trim site spam and ad strings out of scraped text
def __replace__(text):
    content = text.replace("readx(); ()", '')
    # Strip surrounding spaces, newlines and tabs
    content = content.strip()
    content = content.replace("ff37;w039;30fb;;off4d;ff55;247b;50f;8bf4;66f4;65b0;6700;5feb;50f;8bf4;9605;8bfb;7f51;", '')
    content = content.replace("★★★可将您看到的最新章节或 ,方便下次接着看★★★ ---------", '')
    content = content.replace("==<!-br/->ww.uos.<!-->由网友上传==", '')
    content = content.replace("跟-我-读wen文-xue学-lou楼 记住哦!", '')
    content = content.replace("~~www.shushuw.n-更新首发~~", '')
    content = content.replace("自从学会了投票,妈妈再也不用担心我闹书荒了", '')
    content = content.replace("(未完待续。精彩小说【网】记住我们的网址:", '')
    content = content.replace("无节操裸奔求收藏,求推荐,求点击,求抚摸", '')
    content = content.replace("新书上传,求收藏,求推荐!卖身求乳啊!", '')
    content = content.replace("(/无,弹.窗,小,说.网)(..)", '')
    content = content.replace("精彩小说【网】记住我们的网址:", '')
    content = content.replace("【w.w.m 1我|】", '')
    content = content.replace("wenhangshuyuan", '')
    content = content.replace("*************", '')
    content = content.replace("(搜读窝.souduwo)", '')
    content = content.replace("无弹窗小说网www.RT", '')
    content = content.replace("***********", '')
    content = content.replace("手机用户同步阅读请访问", '')
    content = content.replace("U看书(ww..om)", '')
    content = content.replace("<!--over-->", '')
    content = content.replace("(第三次发布此章节)", '')
    content = content.replace("<!--go-->", '')
    content = content.replace("*********", '')
    content = content.replace("<!-br/->.", '')
    content = content.replace("早起求个票~~~~", '')
    content = content.replace("看书要投票啊~~ ", '')
    content = content.replace("readx();", '')
    content = content.replace(" 早起求票", '')
    content = content.replace("*******", '')
    content = content.replace("ww.x.om", '')
    content = content.replace(" ()", '')
    content = content.replace("早起求几张票", '')
    content = content.replace("(未完待续)", '')
    content = content.replace("(未完待续。", '')
    content = content.replace("*****", '')
    content = content.replace("票~~~~", '')
    content = content.replace("(网网)w", '')
    content = content.replace("wxs.o", '')
    content = content.replace("…………", '')
    content = content.replace("  ", '')
    content = content.replace("  ", '')
    content = content.replace("****", '')
    content = content.replace("<!>.", '')
    content = content.replace("未完待续", '')
    content = content.replace("求推荐票", '')
    content = content.replace("c!!!", '')
    content = content.replace("………", '')
    content = content.replace("***", '')
    content = content.replace("~~~", '')
    content = content.replace("()w", '')
    content = content.replace("<br />", '')
    content = content.replace("……", '')
    content = content.replace("**", '')
    content = content.replace(";", '')
    #=========================
    return '  ' + content
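# Note: __replace__ is applied both to the raw index-page HTML in __main__
# and to every chapter body before it is cached.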
# Helper functions: end
#==================================================
class Xxbiquge(object):
    """Scraper for a novel's pages on xxbiquge.com.

    Relies on the module-level `headers` and `soup` set up in __main__.
    """
    def __init__(self):
        self.headers = headers
        self.soup = soup
    #=========================
    # Get the novel title
    def get_title(self):
        # e.g. <h1>武炼巅峰</h1>
        title = str(self.soup.h1.string).decode("utf8")
        # print title
        return title
    #=========================
    # Download the cover image
    def get_cover(self, name='Cover'):
        # e.g. <meta property="og:image" content="http://www.xxbiquge.com/cover/0/347/347s.jpg"/>
        try:
            img_url = self.soup.find('meta', property="og:image").get('content')
            urllib.urlretrieve(img_url, name + '.jpg')
        except Exception as e:
            print u'Failed to download the cover'
            print "-----------"
    #=========================
    # Get the synopsis
    def get_intro(self):
        # e.g. <meta property="og:description" content="武之巅峰,是孤独,是寂寞,是漫漫求索,是高处不胜寒逆境中成长,绝地里求生,不屈不饶,才能堪破武之极道。凌霄阁试炼弟子兼扫地小厮杨开偶获一本无字黑书,从此踏上漫漫武道。"/>
        # intro = self.soup.find('div', id="intro").get_text()
        # print soup.prettify()
        intro1 = self.soup.find('div', id="info").get_text()
        # print self.soup.find_all('div', id="info")
        intro2 = self.soup.find('meta', property="og:description").get('content')
        # print intro2
        return intro1 + '\n' + '=================================================================================' + '\n' + '  ' + intro2
    #=========================
    # Build the pool of chapter URLs to iterate over
    def get_urls(self):
        # URL pool
        hrefs = []
        links = self.soup.find_all('a')
        for link in links:
            href = link.get('href')
            # Chapter links look like /<book>_<id>/<chapter>.html
            if href and re.match("\/\d+_\d+\/\d+\.html", href):
                # print href
                hrefs.append(href)
        # Drop the duplicated link at the head of the list
        del hrefs[0]
        return hrefs
    #=========================
    # Fetch one chapter
    def get_content(self, url='/0_347/1007300.html', plan='0/0'):
        # Build the full URL
        url = 'http://www.xxbiquge.com' + url
        print u'Downloading ' + str(plan) + ': ' + url
        print "-----------"
        req = urllib2.Request(url, headers=self.headers)
        try:
            soup = urllib2.urlopen(req).read()
            soup = BeautifulSoup(soup)
            # Chapter title
            title = str(soup.h1.string).decode("utf8")
            # Chapter body
            content = soup.find('div', id="content").get_text()
            # Trim spam out of the body
            content = __replace__(content)
            # Use the length to decide whether this is a real chapter body
            if len(content) > 1000:
                return True, title, content
            else:
                print url + u' is not chapter text, length ' + str(len(content))
                print "-----------"
                return False, url + u' is not chapter text, length ' + str(len(content))
        except Exception as e:
            print u'Connection failed'
            # Return a failure marker so callers can index the result safely
            return False, url + u' connection failed'
#=========================
if __name__ == '__main__':
    # User-adjustable parameters: start
    # Base URL *** change No below to pick a different novel ***
    # e.g. 'http://www.xxbiquge.com/0_347/'
    #=========================
    # No = '0_347'    # 武炼巅峰
    # No = '0_681'    # 大圣传
    # No = '74_74821' # 圣墟
    No = '2_2306'     # 神墓
    # No = '75_75151' # 天道图书馆
    #=========================
    url_home = 'http://www.xxbiquge.com/' + No + '/'
    # Request headers
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    # Open the novel's index page
    try:
        req = urllib2.Request(url_home, headers=headers)
        soup = urllib2.urlopen(req).read()
        # Trim the page content
        soup = __replace__(soup)
        soup = BeautifulSoup(soup)
        # Pretty-print the soup object
        # print soup.prettify()
    except Exception as e:
        print u'Connection failed'
    # Dictionary that holds the chapter data
    dict_ = {}
    # User-adjustable parameters: end
    #=========================
    # Get the novel title
    title = Xxbiquge().get_title()
    #=========================
    file = title
    # Create a folder for this novel if it does not exist yet
    try:
        os.mkdir(file)
    except OSError:
        pass
    # Switch into the novel's folder
    os.chdir(file)
    #=========================
    try:
        # Load previously saved data so an interrupted run can resume
        dict_ = load_pkl(title)
    except Exception as e:
        print u'No saved data found, starting fresh!'
        print "-----------"
    # Store the title (outside the try, so it is set on the first run too)
    dict_['title'] = title
    # Store the synopsis
    dict_['intro'] = Xxbiquge().get_intro()
    # if title not in dict_:
    # Download the cover
    Xxbiquge().get_cover(title)
    #=========================
    # Build the chapter URL pool
    try:
        hrefs = Xxbiquge().get_urls()
        # Throttle
        quality()
    except Exception as e:
        raise
    #=========================
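    # Resume logic: a chapter URL that is already a key in dict_ was fetched
    # on an earlier run (the dict is re-pickled after every download below),
    # so this loop only requests what is still missing.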
    for i, href in enumerate(hrefs):
        # Progress indicator
        plan = str(i + 1) + '/' + str(len(hrefs))
        if href in dict_:
            print 'http://www.xxbiquge.com' + href + u' already downloaded!'
            print "-----------"
        else:
            list_ = Xxbiquge().get_content(href, plan)
            dict_[href] = list_
            # Persist progress after every chapter
            dump_pkl(title, dict_)
            # Clear the dictionary
            # dict_.clear()
            # Throttle
            quality()
    #=========================
    # Export everything to a single text file
    save_file(title, dict_, hrefs)
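To scrape a different novel, replace `No` with the path segment from its index URL on www.xxbiquge.com and re-run; chapters already in the pickle cache are skipped. Below is a minimal sketch for inspecting that cache afterwards; it assumes the `No = '2_2306'` run titled the book 神墓, so the cache would sit at 神墓/神墓.pkl relative to where the script was launched.

# -*- coding: utf-8 -*-
# Hypothetical cache path; adjust to the novel actually downloaded.
import pickle

with open(u'神墓/神墓.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)
# Chapter entries are tuples; 'title' and 'intro' are plain strings
ok = sum(1 for v in data.values() if isinstance(v, tuple) and v[0])
print '%d entries cached, %d chapters downloaded OK' % (len(data), ok)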