• 欢迎访问佰阅部落
  • 如果您觉得本站非常有看点,那么赶紧使用Ctrl+D 收藏吧

Chinese-Poetry: 最全中文诗歌古典文集数据库(今日热门)

数据可视化 佰阅 3年前 (2019-12-28) 2810次浏览 0个评论

该项目可能是最全中华古诗词数据库, 唐宋两朝近一万四千古诗人, 接近5.5万首唐诗加26万宋诗. 两宋时期1564位词人,21050首词。今日star超过1700+,位居GitHub热门榜首,堪比当时微软发布的Terminal终端。

0.截图

Chinese-Poetry: 最全中文诗歌古典文集数据库(今日热门)

1.背后的故事

为什么要做这个仓库? 古诗是中华民族乃至全世界的瑰宝,我们应该传承下去,虽然有古典文集,但大多数人并没有拥有这些书籍。从某种意义上来说,这些庞大的文集离我们是有一定距离的。而电子版方便拷贝,所以此开源数据库诞生了。此数据库通过 JSON 格式分发,可以让你很方便的开始你的项目。 —–<项目作者介绍>

相关文档:https://ijg.io/r/words/crawl-ci.html

2.诗词爬取过程及数据分析

爬取逻辑没有做相应的系统化处理, 只是简单的脚本, 配合交互式界面做的操作。采用的相关技术: Python + parsel + peewee + requests + jieba

该项目使用两个python脚本,内容如下:

# -*- coding: utf-8 -*-
'''
File Name: parser.py
Author: JackeyGao
mail: gaojunqi@outlook.com
'''
import sys
import random
import time
import requests
import re
from parsel import Selector
from peewee import IntegrityError
from db import Ci
from db import CiAuthor

# HTTP headers sent with every POST issued by the request helpers
# (passed to requests.post as `headers=header`).
# NOTE(review): the Cookie value is hard-coded; presumably it was captured
# from a browser session and may need refreshing — confirm.
header = {
    "Connection": "keep-alive",
    "Origin": "http://qsc.zww.cn",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Referer": "http://qsc.zww.cn/",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4",
    "Cookie": "Hm_lvt_12506b8a4147836b0046047de09b2a2e=1493688567; _D_SID=92CED13DD066A18AEC64F1086BA2B715; ASPSESSIONIDSABSRATC=OOFAEFEAJAGIAIEMGGAEDBNL; UM_distinctid=15c6821bb13453-0fd27be8dc79a5-30657509-13c680-15c6821bb14468; CNZZDATA618132=cnzz_eid%3D761011847-1496395659-null%26ntime%3D1496395659"
}

# Python 2-only idiom: `reload` is not a builtin on Python 3. It switches the
# interpreter's default string encoding to UTF-8, which the implicit
# str/unicode conversions in this script rely on.
reload(sys)
sys.setdefaultencoding('utf-8')


# Non-greedy capture of whatever sits between a '(' and the next ')'.
# Applied to anchor `onclick` attributes to pull out the call arguments.
seek_patt = re.compile(r"\((.*?)\)", re.I|re.X)

# ---------------------------------
class QTSBase(object):
    """Stand-in for the ``parent.QTS`` object named in the page's script lines.

    The crawler executes lines such as ``parent.QTS.filllist(...)`` taken
    from ``<script>`` elements; each method here just captures its argument
    so the caller can read it back afterwards.
    """

    def __init__(self):
        # Define both attributes up front so that reading them before any
        # fill*() call yields None instead of raising AttributeError.
        self.content = None
        self.page = None

    def filllist(self, content):
        """Store the list markup passed by the script line."""
        self.content = content

    def fillpage(self, fillpage):
        """Store the pager text passed by the script line."""
        self.page = fillpage

    def fillbody(self, content):
        """Store the body markup; it shares ``content`` with filllist()."""
        self.content = content

class ParentBase(object):
    """Holder exposing a ``QTS`` attribute so ``parent.QTS.<method>(...)`` resolves."""

    def __init__(self):
        # One QTSBase per holder; the executed script lines write into it.
        self.QTS = QTSBase()

# ----------------------------------

# Module-level receiver: the script lines executed by the callbacks refer to
# it by the name `parent`, so this name must not change.
parent = ParentBase()


# Seed the pager text with a plain method call; running a constant string
# through exec() added nothing over calling the method directly.
parent.QTS.fillpage('第1页 共92页 1564条')


def __with_seek_type__(seek_type, timeout=None):
    """Build a request function bound to one ``seektype`` of getdata.asp.

    Args:
        seek_type: value sent as the ``seektype`` form field on every call.
        timeout: optional timeout forwarded to ``requests.post``. The
            default ``None`` keeps the previous behaviour (no timeout).
            Callers that pass a value should be prepared for
            ``requests.exceptions.Timeout``.

    Returns:
        A function ``request(pageno, value='')`` that POSTs the form with
        the module-level ``header`` dict and returns the response object.
    """
    def request(pageno, value=''):
        """POST one query; ``pageno`` is coerced with int() before sending."""
        url = 'http://qsc.zww.cn/getdata.asp'
        payload = {
            'seektype': seek_type,
            'seekvalue': value,
            'pageno': int(pageno),
        }

        return requests.post(
            url,
            data=payload,
            headers=header,
            timeout=timeout,
        )

    return request


def parse(html, callback, *args, **kwargs):
    """Re-decode ``html`` as GB2312 and pass a Selector over it to ``callback``.

    The text is round-tripped through latin1 bytes and then decoded as
    gb2312 with undecodable bytes dropped.
    NOTE(review): presumably the server sends GB2312 bytes that were decoded
    as latin1 upstream — confirm.

    Extra positional and keyword arguments are forwarded to ``callback``;
    its return value is returned unchanged.
    """
    raw_bytes = html.decode('utf8').encode('latin1')
    document = Selector(text=raw_bytes.decode('gb2312', 'ignore'))
    return callback(document, *args, **kwargs)


def callback_author_list(sel, *args, **kwargs):
    """Store every author link found in an author-list response.

    Reads the first ``<script>`` element, executes each line that starts with
    ``parent.QTS.filllist`` so the markup lands in ``parent.QTS.content``,
    then walks the ``<a>`` elements of that markup. Anchors whose first
    onclick argument is ``'10'`` are saved as ``CiAuthor`` rows; a duplicate
    primary key is reported and skipped.

    Raises IndexError when the response has no ``<script>`` element or an
    anchor lacks an onclick attribute, parenthesised arguments or text, and
    ValueError when the arguments do not split into exactly three parts.
    """
    script = sel.xpath('//script').extract()[0]
    for line in script.splitlines():
        if line.startswith('parent.QTS.filllist'):
            # SECURITY: this runs text received from the remote server as
            # Python code. Only acceptable while the endpoint is trusted.
            exec(line)

            fragment = Selector(text=unicode(parent.QTS.content))

            for anchor in fragment.xpath('//a'):
                onclick = anchor.xpath('@onclick').extract()[0]
                arguments = seek_patt.findall(onclick)[0]
                _type, value, pageno = arguments.split(',')
                text = anchor.xpath('text()').extract()[0]

                if _type == '10':
                    name = text.replace('…', '')

                    # Persist the author; the primary key is the link's value.
                    try:
                        CiAuthor.create(value=value, name=name)
                    except IntegrityError:
                        print("重复主键%s, 已跳过." % value)
                    else:
                        print("主键%s, 已创建." % value)


def callback_author_info(sel, *args, **kwargs):
    """Save an author's long description from an author-info response.

    Executes the first line of the first ``<script>`` element that starts
    with ``parent.QTS.fillbody``, parses the markup it leaves in
    ``parent.QTS.content``, and joins the stripped text nodes from the sixth
    one onward into ``author.long_desc``.

    Args:
        sel: Selector over the whole response.
        **kwargs: must contain ``author``, the record to update. Its
            ``long_desc`` is assigned and ``save()`` is called;
            ``short_desc`` is left untouched.

    Returns:
        The Selector built from the filled body for the first matching
        line, or the original ``sel`` when no line matches.

    Raises:
        IndexError: if the response has no ``<script>`` element.
        KeyError: if a line matches but ``author`` was not passed.
    """
    data = sel.xpath('//script').extract()[0]
    for line in data.splitlines():
        if not line.startswith('parent.QTS.fillbody'):
            continue

        # SECURITY: this runs text received from the remote server as Python
        # code. Only acceptable while the endpoint is trusted.
        exec(line)

        body = Selector(
            text=unicode(parent.QTS.content)
        )

        # The previous version also read texts[1] into an unused local,
        # which re-ran the XPath and raised IndexError on short bodies.
        texts = body.xpath('//text()').extract()

        long_desc = ''.join([s.strip() for s in texts[5:]]).strip()

        author = kwargs["author"]
        author.long_desc = long_desc
        author.save()
        print("主键%s(%s), 已更新" % (author.value, author.name))

        return body
    return sel


def callback_ci_info(sel, *args, **kwargs):
    """Create or refresh one ``Ci`` row from a ci-info response.

    Uses the first line of the first ``<script>`` element that starts with
    ``parent.QTS.fillbody`` and does not contain '宋体'. After executing it,
    the markup in ``parent.QTS.content`` supplies the first ``<b>`` text as
    ``rhythmic``, the second text node as ``author`` and the remaining text
    nodes, newline-joined, as ``content``. The row is keyed by
    ``kwargs["seekid"]``; when the key already exists the row is updated.

    Returns the Selector over the filled body, or None when no line
    qualifies.
    """
    script = sel.xpath('//script').extract()[0]
    for line in script.splitlines():
        wanted = line.startswith('parent.QTS.fillbody') and '宋体' not in line
        if not wanted:
            continue

        # SECURITY: this runs text received from the remote server as Python
        # code. Only acceptable while the endpoint is trusted.
        exec(line)

        fragment = Selector(text=unicode(parent.QTS.content))

        value = kwargs["seekid"]

        rhythmic = fragment.xpath('//b/text()').extract()[0]
        texts = fragment.xpath('//text()').extract()
        author = texts[1]
        content = '\n'.join(texts[2:])

        fields = dict(rhythmic=rhythmic, author=author, content=content)
        try:
            Ci.create(value=value, **fields)
            print("主键%s, 已创建." % value)
        except IntegrityError:
            # The key is already stored: overwrite the row's other columns.
            Ci.update(**fields).where(Ci.value == value).execute()
            print("重复主键%s, 已更新." % value)

        return fragment

    return None


# One request helper per `seektype` value sent to getdata.asp.
# Used by the main script to page through the author list.
f_author_list = __with_seek_type__(1)
# Used by the main script to fetch one author's info by its value.
f_author_info = __with_seek_type__(10)
# Not called anywhere in the visible script.
f_ci_list = __with_seek_type__(5)
# Used by the main script to fetch one ci by its numeric id.
f_ci_info = __with_seek_type__(9)


#resp = f_ci_info(1, value=1460)
#sel = parse(resp.text, callback_ci_info, seekid=1)

if __name__ == '__main__':
    # Crawl the author list, one request per page (pages 1..92).
    for p in range(1, 93):
        resp = f_author_list(p, value=1)
        parse(resp.text, callback_author_list)

    # Crawl the info page of every stored author.
    for i in CiAuthor.select().where(CiAuthor.value>0):
        resp = f_author_info(1, value=i.value)
        parse(resp.text, callback_author_info, author=i)

    # Crawl every ci by numeric id (1..21050).
    # A connection error used to sleep and then move on to the next id,
    # silently dropping that ci. Retry the same id a bounded number of
    # times; after the last failed attempt the id is skipped as before.
    max_attempts = 5
    for i in range(1, 21051):
        for attempt in range(max_attempts):
            try:
                resp = f_ci_info(1, value=i)
            except requests.exceptions.ConnectionError as e:
                wait_seconds = random.choice(range(1, 10))
                print("等待%s..异常(%s)" % (wait_seconds, str(e)))

                time.sleep(wait_seconds)
            else:
                # Parsing sits outside the try so its own errors are not
                # mistaken for a failed request.
                parse(resp.text, callback_ci_info, seekid=i)
                break

crawl_songci_parse.py hosted with ❤ by GitHub

# -*- coding: utf-8 -*-
'''
File Name: db.py
Author: JackeyGao
mail: junqi.gao@shuyun.com
'''

from peewee import *

# SQLite database file used by both models in this module.
db = SqliteDatabase('ci.db')

class CiAuthor(Model):
    """Author record, keyed by an integer ``value``."""
    # Primary key; the crawler sends it as the seek value for author info.
    value = IntegerField(primary_key=True)
    name = CharField()
    # Long description text; nullable until the crawler fills it in.
    long_desc = TextField(null=True)
    # Nullable; never assigned by the crawler script shown alongside.
    short_desc = TextField(null=True)

    class Meta:
        database = db # This model is stored in the `db` database ('ci.db').

class Ci(Model):
    """One ci (poem) record, keyed by an integer ``value``."""
    # Primary key; the crawler uses the numeric id it requested.
    value = IntegerField(primary_key=True)
    # Text of the first <b> element of the fetched body.
    rhythmic = CharField()
    # Author stored as plain text, not as a foreign key to CiAuthor.
    author = CharField()
    # Remaining text nodes joined with newlines; nullable.
    content = TextField(null=True)

    class Meta:
        database = db  # Stored in the same `db` database as CiAuthor.


#def delete_note():
#    return Note.delete().execute()
#
#def delete_image():
#    return Image.delete().execute()

def init_table():
    """Open the database connection and create the Ci and CiAuthor tables."""
    models = [Ci, CiAuthor]
    db.connect()
    db.create_tables(models)

# Running this file directly creates the database tables.
if __name__ == '__main__':
    init_table()

crawl_songci_db.py hosted with ❤ by GitHub

3.运行

分别保存上面两个脚本为 parse.py 和 db.py, 然后执行以下命令

pip install peewee parsel requests
python db.py # 初始化数据库
python parse.py

4.相关案例

5.点评

该项目,爬取结果以json格式存储,而json,yaml等作为格式界最受欢迎、也是最常用的数据记录格式,因此其他各种语言的程序都可以轻松读取,也非常适合做成API以供各种调用。古诗是中华民族乃至全世界的瑰宝,我们应该传承下去。最后附录随机看到的一首诗词:

定风波·南海归赠王定国侍人寓娘

宋·苏轼

常羡人间琢玉郎,

天应乞与点酥娘。

尽道清歌传皓齿,

风起,雪飞炎海变清凉。

万里归来颜愈少,

微笑,笑时犹带岭梅香。

试问岭南应不好,

却道:此心安处是吾乡。

 


佰阅部落 , 版权所有丨如未注明 , 均为原创丨本网站采用BY-NC-SA协议进行授权
转载请注明原文链接:Chinese-Poetry: 最全中文诗歌古典文集数据库(今日热门)
喜欢 (1)

您必须 登录 才能发表评论!