该项目可能是最全中华古诗词数据库, 唐宋两朝近一万四千古诗人, 接近5.5万首唐诗加26万宋诗. 两宋时期1564位词人,21050首词。今日star数超过1700,位居GitHub热门榜首,堪比当时微软发布的Terminal终端。
0.截图
1.背后的故事
为什么要做这个仓库? 古诗是中华民族乃至全世界的瑰宝,我们应该传承下去,虽然有古典文集,但大多数人并没有拥有这些书籍。从某种意义上来说,这些庞大的文集离我们是有一定距离的。而电子版方便拷贝,所以此开源数据库诞生了。此数据库通过 JSON 格式分发,可以让你很方便的开始你的项目。 —–<项目作者介绍>
相关文档:https://ijg.io/r/words/crawl-ci.html
2.诗词爬取过程及数据分析
爬取逻辑没有做相应的系统化处理, 只是简单的脚本, 配合交互式界面做的操作。采用的相关技术: Python + parsel + peewee + requests + jieba
该项目使用两个python脚本,内容如下:
# -*- coding: utf-8 -*-
'''
File Name: parse.py
Author: JackeyGao
mail: gaojunqi@outlook.com
'''
import sys
import random
import time
import requests
import re
from parsel import Selector
from peewee import IntegrityError
from db import Ci
from db import CiAuthor
# HTTP headers sent with every POST made by the request helpers
# (passed as ``headers=header``).
# NOTE(review): the "Cookie" value is a hard-coded browser session; it is
# presumably stale by now -- confirm whether the site needs it at all, and
# avoid publishing live session cookies.
header = {
"Connection": "keep-alive",
"Origin": "http://qsc.zww.cn",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
"Content-Type": "application/x-www-form-urlencoded",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Referer": "http://qsc.zww.cn/",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4",
"Cookie": "Hm_lvt_12506b8a4147836b0046047de09b2a2e=1493688567; _D_SID=92CED13DD066A18AEC64F1086BA2B715; ASPSESSIONIDSABSRATC=OOFAEFEAJAGIAIEMGGAEDBNL; UM_distinctid=15c6821bb13453-0fd27be8dc79a5-30657509-13c680-15c6821bb14468; CNZZDATA618132=cnzz_eid%3D761011847-1496395659-null%26ntime%3D1496395659"
}
# Python 2 only: reloading ``sys`` makes ``setdefaultencoding`` available
# again so that implicit str <-> unicode conversions use UTF-8.  ``reload``
# is not a builtin on Python 3, so this script does not run there as-is.
reload(sys)
sys.setdefaultencoding('utf-8')
# Captures the text between "(" and the next ")"; it is applied to the
# ``onclick`` attribute of links to pull out the call arguments.  The
# pattern contains no letters or whitespace, so the I and X flags have no
# effect on what it matches.
seek_patt = re.compile(r"\((.*?)\)", re.I|re.X)
# ---------------------------------
class QTSBase(object):
    """Python stand-in for the ``parent.QTS`` object named in the site's scripts.

    The crawler runs ``parent.QTS.fill*(...)`` lines taken from a response
    through ``exec``; each method simply remembers the argument it received
    so the caller can read it back afterwards.
    """

    def filllist(self, content):
        """Store ``content`` on ``self.content``."""
        self.content = content

    def fillpage(self, fillpage):
        """Store the given value on ``self.page``."""
        self.page = fillpage

    def fillbody(self, content):
        """Store ``content`` on ``self.content`` (same slot as ``filllist``)."""
        self.filllist(content)
class ParentBase(object):
    """Container exposing a ``QTS`` attribute, so ``parent.QTS.<call>`` resolves."""

    def __init__(self):
        # A fresh receiver for the fill* calls.
        self.QTS = QTSBase()
# ----------------------------------
# Module-wide receiver that the exec'd ``parent.QTS.*`` lines write into.
parent = ParentBase()
# Sample pager call, invoked directly; running a constant string through
# exec has the same effect.
parent.QTS.fillpage('第1页 共92页 1564条')
def __with_seek_type__(seek_type):
    """Build a request function bound to one ``seektype`` form value.

    Args:
        seek_type: Value sent as the ``seektype`` form field on every call.

    Returns:
        A function ``request(pageno, value='', timeout=None)`` that POSTs the
        form to http://qsc.zww.cn/getdata.asp with the module-level ``header``
        dict and returns the ``requests`` response object.
    """
    url = 'http://qsc.zww.cn/getdata.asp'

    def request(pageno, value='', timeout=None):
        """POST one query and return the response.

        Args:
            pageno: Page number; coerced with ``int()``.
            value: Sent as the ``seekvalue`` form field.
            timeout: Forwarded to ``requests.post``.  The default ``None``
                keeps the previous behaviour (wait indefinitely); pass a
                number of seconds so a stalled connection cannot hang the
                crawl forever.
        """
        payload = {
            'seektype': seek_type,
            'seekvalue': value,
            'pageno': int(pageno),
        }
        return requests.post(
            url,
            data=payload,
            headers=header,
            timeout=timeout,
        )

    return request
def parse(html, callback, *args, **kwargs):
    """Re-decode a response body, wrap it in a Selector and hand it to ``callback``.

    Args:
        html: Response text as received from ``requests``.
        callback: Called as ``callback(selector, *args, **kwargs)``.
        *args, **kwargs: Forwarded to ``callback`` unchanged.

    Returns:
        Whatever ``callback`` returns.
    """
    # Round-trip through latin1 to get the raw bytes back, then decode them
    # as GB2312.  Presumably the body was GB2312 but had been decoded as
    # latin1 upstream -- confirm against the server's response headers.
    # NOTE(review): 'ignore' silently drops any byte sequence that is not
    # valid GB2312.
    fixed = html.decode('utf8').encode('latin1').decode('gb2312', 'ignore')
    return callback(Selector(text=fixed), *args, **kwargs)
def callback_author_list(sel, *args, **kwargs):
    """Store every author link found in one author-list response.

    The first ``<script>`` element is scanned for lines starting with
    ``parent.QTS.filllist``; running such a line leaves the list markup on
    ``parent.QTS.content``.  Every ``<a>`` in that markup whose ``onclick``
    arguments are ``type,value,pageno`` with type ``10`` is saved as a
    ``CiAuthor`` row keyed by ``value``.

    Args:
        sel: Selector built from the response.
        *args, **kwargs: Accepted for the common callback signature; unused.

    Returns:
        None.
    """
    scripts = sel.xpath('//script').extract()

    # Clear the slot first: without this, a response that carries no
    # filllist line would be processed with the previous page's markup.
    parent.QTS.content = None
    if scripts:
        for line in scripts[0].splitlines():
            if not line.startswith('parent.QTS.filllist'):
                continue
            # SECURITY: this executes text returned by the remote server as
            # Python code.  Only the prefix is checked; a hostile or
            # compromised server could run arbitrary code here.  Consider
            # extracting the string argument instead of exec'ing the line.
            exec(line)
    if parent.QTS.content is None:
        print("未找到列表数据, 已跳过.")
        return

    listing = Selector(
        text=unicode(parent.QTS.content)
    )
    for anchor in listing.xpath('//a'):
        # Links without a parsable onclick call are not seek links: skip
        # them instead of failing on an empty result list.
        onclick = anchor.xpath('@onclick').extract()
        call_args = seek_patt.findall(onclick[0]) if onclick else []
        if not call_args:
            continue
        fields = call_args[0].split(',')
        if len(fields) != 3 or fields[0] != '10':
            continue
        value = fields[1]

        labels = anchor.xpath('text()').extract()
        if not labels:
            print("主键%s, 缺少名称, 已跳过." % value)
            continue
        name = labels[0].replace('…', '')

        # Save the author; an existing primary key is reported and skipped.
        try:
            CiAuthor.create(
                value=value,
                name=name
            )
        except IntegrityError:
            print("重复主键%s, 已跳过." % value)
        else:
            print("主键%s, 已创建." % value)
def callback_author_info(sel, *args, **kwargs):
    """Save the long description of one author.

    The first ``<script>`` element is scanned for lines starting with
    ``parent.QTS.fillbody``; running such a line leaves the body markup on
    ``parent.QTS.content``.  The text nodes of that markup, minus the first
    five, are stripped, concatenated and stored as ``author.long_desc``.

    Args:
        sel: Selector built from the response.
        **kwargs: Must contain ``author``, the ``CiAuthor`` row to update.

    Returns:
        The Selector built from the body markup, or the incoming ``sel``
        unchanged when the response carried no body.

    Raises:
        KeyError: If ``author`` is not supplied.
    """
    author = kwargs["author"]
    scripts = sel.xpath('//script').extract()

    # Clear the slot first: without this, a response without a fillbody line
    # would store the previous author's description on this author.
    parent.QTS.content = None
    if scripts:
        for line in scripts[0].splitlines():
            if not line.startswith('parent.QTS.fillbody'):
                continue
            # SECURITY: this executes text returned by the remote server as
            # Python code.  Only the prefix is checked; consider extracting
            # the string argument instead of exec'ing the line.
            exec(line)
    if parent.QTS.content is None:
        print("主键%s(%s), 无内容, 已跳过" % (author.value, author.name))
        return sel

    sel = Selector(
        text=unicode(parent.QTS.content)
    )
    ds = sel.xpath('//text()').extract()
    # The first five text nodes are dropped -- presumably heading/name
    # nodes that precede the description; confirm against a live response.
    lon = ''.join([s.strip() for s in ds[5:]]).strip()
    author.long_desc = lon
    author.save()
    print("主键%s(%s), 已更新" % (author.value, author.name))
    return sel
def callback_ci_info(sel, *args, **kwargs):
    """Create or update the ``Ci`` row for one poem.

    The first ``<script>`` element is scanned for lines starting with
    ``parent.QTS.fillbody`` (lines containing '宋体' are skipped); running
    such a line leaves the body markup on ``parent.QTS.content``.  From that
    markup the first ``<b>`` text becomes ``rhythmic``, the second text node
    becomes ``author`` and the remaining text nodes, joined by newlines,
    become ``content``.

    Args:
        sel: Selector built from the response.
        **kwargs: Must contain ``seekid``, used as the row's primary key.

    Returns:
        The Selector built from the body markup, or the incoming ``sel``
        unchanged when the response carried no usable body.

    Raises:
        KeyError: If ``seekid`` is not supplied.
    """
    value = kwargs["seekid"]
    scripts = sel.xpath('//script').extract()

    # Clear the slot first: without this, a response whose fillbody lines are
    # all skipped would store the previous poem under this id.
    parent.QTS.content = None
    if scripts:
        for line in scripts[0].splitlines():
            if not line.startswith('parent.QTS.fillbody'):
                continue
            if '宋体' in line:
                continue
            # SECURITY: this executes text returned by the remote server as
            # Python code.  Only the prefix is checked; consider extracting
            # the string argument instead of exec'ing the line.
            exec(line)
    if parent.QTS.content is None:
        print("主键%s, 无内容, 已跳过." % value)
        return sel

    sel = Selector(
        text=unicode(parent.QTS.content)
    )
    titles = sel.xpath('//b/text()').extract()
    texts = sel.xpath('//text()').extract()
    # A body without a title or an author node cannot be stored; report it
    # instead of failing on the index lookups below.
    if not titles or len(texts) < 2:
        print("主键%s, 内容不完整, 已跳过." % value)
        return sel

    rhythmic = titles[0]
    author = texts[1]
    content = '\n'.join(texts[2:])
    try:
        Ci.create(
            value=value,
            rhythmic=rhythmic,
            author=author,
            content=content
        )
    except IntegrityError:
        # The primary key already exists: overwrite the stored row.
        Ci.update(
            rhythmic=rhythmic,
            author=author,
            content=content
        ).where(
            Ci.value == value
        ).execute()
        print("重复主键%s, 已更新." % value)
    else:
        print("主键%s, 已创建." % value)
    return sel
# One request function per ``seektype`` form value.
# NOTE(review): what each numeric type means to the server is inferred from
# the variable names only -- confirm against the site.
f_author_list = __with_seek_type__(1)
# 10 is also the onclick type that callback_author_list keeps.
f_author_info = __with_seek_type__(10)
# Not called by the __main__ crawl in this script.
f_ci_list = __with_seek_type__(5)
f_ci_info = __with_seek_type__(9)
#resp = f_ci_info(1, value=1460)
#sel = parse(resp.text, callback_ci_info, seekid=1)
if __name__ == '__main__':
    # Upper bound on attempts per poem, so a dead server cannot stall the
    # crawl forever.
    MAX_ATTEMPTS = 5

    # Crawl the author list, page by page.
    for page in range(1, 93):
        resp = f_author_list(page, value=1)
        parse(resp.text, callback_author_list)

    # Crawl the description of every stored author.
    for author in CiAuthor.select().where(CiAuthor.value > 0):
        resp = f_author_info(1, value=author.value)
        parse(resp.text, callback_author_info, author=author)

    # Crawl every poem by id.  A connection error retries the SAME id after
    # a short random pause; moving on to the next id would lose the poem.
    for ci_id in range(1, 21051):
        resp = None
        for _attempt in range(MAX_ATTEMPTS):
            try:
                resp = f_ci_info(1, value=ci_id)
            except requests.exceptions.ConnectionError as e:
                wait_seconds = random.randint(1, 9)
                print("等待%s..异常(%s)" % (wait_seconds, str(e)))
                time.sleep(wait_seconds)
            else:
                break
        if resp is None:
            print("主键%s, 连续%s次请求失败, 已跳过." % (ci_id, MAX_ATTEMPTS))
            continue
        parse(resp.text, callback_ci_info, seekid=ci_id)
crawl_songci_parse.py hosted with ❤ by GitHub
# -*- coding: utf-8 -*-
'''
File Name: db.py
Author: JackeyGao
mail: junqi.gao@shuyun.com
'''
from peewee import *
# SQLite database used by both models below.  'ci.db' is a relative path,
# so the file is created in the current working directory.
db = SqliteDatabase('ci.db')
class CiAuthor(Model):
    """One author row; ``value`` is the integer primary key."""

    value = IntegerField(primary_key=True)
    name = CharField()
    # Both descriptions are nullable, so a row can be created from the name alone.
    long_desc = TextField(null=True)
    short_desc = TextField(null=True)

    class Meta:
        database = db  # This model is stored in the 'ci.db' database.
class Ci(Model):
    """One poem row; ``value`` is the integer primary key."""

    value = IntegerField(primary_key=True)
    rhythmic = CharField()
    author = CharField()
    # Nullable body text.
    content = TextField(null=True)

    class Meta:
        database = db  # Same database as CiAuthor.
#def delete_note():
# return Note.delete().execute()
#
#def delete_image():
# return Image.delete().execute()
def init_table():
    """Create the ``Ci`` and ``CiAuthor`` tables if they do not exist yet.

    ``safe=True`` makes the call idempotent: running ``python db.py`` a
    second time no longer fails on peewee versions whose ``create_tables``
    defaults to ``safe=False``.
    """
    db.connect()
    db.create_tables([Ci, CiAuthor], safe=True)


if __name__ == '__main__':
    init_table()
crawl_songci_db.py hosted with ❤ by GitHub
3.运行
分别保存上面两个脚本为parse.py和db.py, 然后执行以下命令
pip install peewee parsel requests
python db.py # 初始化数据库
python parse.py
4.相关案例
- 中文诗歌主页是一个基于浏览器的诗词网站,包含唐诗三百首、宋词三百首等文集。
- animalize / QuanTangshi 离线全唐诗 Android
- justdark / pytorch-poetry-gen a char-RNN based on pytorch
- Clover27 / ancient-Chinese-poem-generator Ancient-Chinese-Poem-Generator
- chinese-poetry / poetry-calendar 诗词周历
- chenyuntc / pytorch-book 简体唐诗生成(char-RNN),可生成藏头诗,自定义诗歌意境,前缀等
- okcy1016 / poetry-desktop 诗词桌面
- huangjianke / weapp-poem 诗词墨客 小程序版
5.点评
该项目的爬取结果以json格式存储,而json、yaml等是目前最受欢迎、也是最常用的数据记录格式,因此其他各种语言的程序都可以轻松读取,也非常适合做成API以供各种调用。古诗是中华民族乃至全世界的瑰宝,我们应该传承下去。最后附录随机看到的一首诗词:
定风波·南海归赠王定国侍人寓娘
宋·苏轼
常羡人间琢玉郎,
天应乞与点酥娘。
尽道清歌传皓齿,
风起,雪飞炎海变清凉。
万里归来颜愈少,
微笑,笑时犹带岭梅香。
试问岭南应不好,
却道:此心安处是吾乡。