曹杰坤的实验报告

第一部分



import json
import os
from time import sleep
from urllib import parse
import requests
import time,random
from fake_useragent import UserAgent

ua = UserAgent()
userAgen = ua.random
def get_adress(bank_name):
    url = "http://www.cninfo.com.cn/new/information/topSearch/detailOfQuery"
    data = {
        'keyWord': bank_name,
        'maxSecNum': 10,
        'maxListNum': 5,
    }
    hd = {
        'Host': 'www.cninfo.com.cn',
        'Origin': 'http://www.cninfo.com.cn',
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip,deflate',
        'Connection': 'keep-alive',
        'Content-Length': '70',
        'User-Agent':userAgen,
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json,text/plain,*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }
    get_json = requests.post(url, headers=hd, data=data)
    data_json = get_json.content
    toStr = str(data_json, encoding="utf-8")
    last_json = json.loads(toStr)
    orgId = last_json["keyBoardList"][0]["orgId"]  # 获取参数
    plate = last_json["keyBoardList"][0]["plate"]
    code = last_json["keyBoardList"][0]["code"]
    return orgId, plate, code

def download_PDF(url, file_name):  # 下载pdf
    url = url
    r = requests.get(url)
    f = open(company + "/" + file_name + ".pdf", "wb")
    f.write(r.content)


def get_PDF(orgId, plate, code):
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    data = {
        'stock': '{},{}'.format(code, orgId),
        'tabName': 'fulltext',
        'pageSize': 20,
        'pageNum': 1,
        'column': plate,
        'category': 'category_ndbg_szsh;',
        'plate': '',
        'seDate': '',
        'searchkey': '',
        'secid': '',
        'sortName': '',
        'sortType': '',
        'isHLtitle': 'true',
    }

    hd = {
        'Host': 'www.cninfo.com.cn',
        'Origin': 'http://www.cninfo.com.cn',
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip,deflate',
        'Connection': 'keep-alive',
        'User-Agent': ua.random,
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json,text/plain,*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'X-Requested-With': 'XMLHttpRequest',
    }
    data = parse.urlencode(data)
    data_json = requests.post(url, headers=hd, data=data)
    toStr = str(data_json.content, encoding="utf-8")
    last_json = json.loads(toStr)
    reports_list = last_json['announcements']
    for report in reports_list:
        if '摘要' in report['announcementTitle'] or "20" not in report['announcementTitle']:
            continue
        if 'H' in report['announcementTitle']:
            continue
        else:  # http://static.cninfo.com.cn/finalpage/2019-03-29/1205958883.PDF
            pdf_url = "http://static.cninfo.com.cn/" + report['adjunctUrl']
            file_name = report['announcementTitle']
            print("正在下载:" + pdf_url, "存放在当前目录:/" + company + "/" + file_name)
            download_PDF(pdf_url, file_name)
            time.sleep(random.random()*3)


if __name__ == '__main__':
    list = ['000012','000023','000401','000672','000786','000789','000795','000877','000935','002066']
    for company in list:
        os.mkdir(company)
        orgId, plate, code=get_adress(company)
        get_PDF(orgId, plate, code)
        print("下载成功")


结果一

结果截图 结果截图

第二部分



import pdfplumber
import re
import os
import numpy as np
import sys
import csv
import time,datetime

path1="D:\期末实验报告\年报下载\000012"#定义上一步存放pdf所在文件夹
path2="D:\期末实验报告\数据获取"#定义提取后的数据存放文件夹
temfilename = "数据"
name_list=os.listdir(path1)#利用os库获取路径1下的pdf文件名称
print(name_list)

x='000012'
def creat_csv(x,csv_head):
    path = str(path2+"\\"+"(%s).csv" % (x))
    with open(path,'w',newline=''                                                                                                                                                                                                                                                                                                                                                                                                                              ,encoding='utf-8_sig') as f:
        csv_write = csv.writer(f)
        csv_write.writerow(csv_head)

csv_head = ["营业收入(元)","基本每股收益(元/股)","股票简称",
"股票代码","办公地址","公司网址"]
creat_csv(x,csv_head)

def write_csv(x,data_row):
    path = str(path2+"\\"+temfilename+"(%s).csv" % (x))
    with open(path,mode="a",newline = "",encoding="utf-8_sig") as f:
        csv_write = csv.writer(f)
        csv_write.writerow(data_row)


import fitz
import re
import pandas as pd

doc = fitz.open('2021年度报表.pdf')

class NB():
    '''
    解析上市公司年度报告
    '''
    def __init__(self,pdf_filename):
        self.doc = fitz.open(pdf_filename)
        self.pdf_name = pdf_filename
        self.get_toc()
        self.jie_pages_title()
        self.get_key_findata_pages()
        self.get_target_page()

    def get_toc(self):
        jie_zh = '一二三四五六七八九十'
        p = re.compile('(第[%s]{1,2}节)\s+(\w[、\w]*\w)' % jie_zh)
        toc = []
        for page in self.doc:
            txt = page.get_text()
            match = p.findall(txt)
            if len(match) != 0:
                first_match = match[0]
                toc.append((first_match, page.number))
        #
        self.toc = toc

    def jie_pages_title(self):
        toc = self.toc
        jie_pages = {}
        jie_title = {}
        for t in toc:
            jie, title, pageNumber = t[0][0], t[0][1], t[1]
            if jie in jie_pages:
                jie_pages[jie].append(pageNumber)
            else:
                jie_pages[jie] = [pageNumber]
                jie_title[jie] = title
        self.jie_pages = jie_pages
        self.jie_title = jie_title

    def get_key_findata_pages(self):
        pages = ''
        jie_title = self.jie_title
        titles = ['公司简介和主要财务指标', '会计数据和财务指标摘要']
        for jie in jie_title:
            title = jie_title[jie]
            if title in titles: pages = self.jie_pages[jie]; break
        if pages == '':
            Warning('没有找到“公司简介和主要财务指标”或“会计数据和财务指标摘要”')
        #
        self.key_fin_data_pages = pages
        return(pages)

    def get_target_page(self):
        pages = self.key_fin_data_pages
        pattern = re.compile('主要会计数据和财务指标.*?营业收入', re.DOTALL)
        target_page = ''
        for p in pages:
            page = self.doc[p]
            txt = page.get_text()
            matchObj = pattern.search(txt)
            if matchObj is not None:
                target_page = p
                break
        if target_page == '':
            Warning('没找到“主要会计数据和财务指标”页')
        self.key_fin_data_page = target_page
        return(target_page)


结果二

结果截图 结果截图

第三部分



os.chdir('..\\')#为了后续将图片保存在父文件夹中
 name_list = ['2021',"2020","2019","2018","2017","2016","2015","2014",'2013','2012']
 list_name_1 = ['000012','000023','000401','000672','000786','000789','000795','000877','000935','002066']
 def y_ticks(list_row,list_name_1):
     num_list_1 = list_row
     rects = plt.barh(range(len(list_row)),num_list_1,color='rgby')
     N = 10
     index = np.arange(N)
     plt.yticks(index,name_list)
     plt.title(list_name_1+" 2012—2021营业收入对比")
     plt.xlabel("年份")
     plt.ylabel("营业收入(亿元)")
     for rect in rects:
         w=rect.get_width()
         plt.text(w,rect.get_y()+rect.get_height()/2,w,size =10,ha='left',va='center')
     plt.savefig(list_name_1 +".png",dpi = 600)
     plt.show()
 for i in range(len(list_row)):
     y_ticks(list_row_1[i], list_name_1[i])

 def x_ticks2(list_columns_profit,name_list):
     num_list = list_columns_profit
     rects = plt.bar(range(len(list_columns_profit)),num_list,color="rgb",width = 1,tick_label=list_name_1)
     plt.title(name_list+"不同公司每股净收益对比")
     plt.xlabel("每股净收益(元)")
     plt.ylabel("公司名称")
     for rect in rects:
         height = rect.get_height()
         plt.text(rect.get_x() + rect.get_width() / 2, height, str(height), size=10, ha="center", va="bottom")
     plt.savefig(name_list +"净收益.png",dpi = 600)
     plt.show()
 for i in range(len(list_columns_profit)):
     x_ticks2(list_columns_profit[i], name_list[i])


结果三

营业收入与基本每股收益变化趋势折线图

结果截图 结果截图 结果截图 结果截图 结果截图 结果截图 结果截图 结果截图 结果截图 结果截图

解释

从营业收入与基本每股收益变化趋势折线图中可以看出这些行业的领头企业,在近十年的发展过程中一直保持这稳步上升的势头,并且观察变动趋势可以得出其在2016、2017年左右出现较大幅度的增长,通过经济学方面的知识与素养猜测可能与那几年的房地产热有关。从总体来看该行业仍然保持着增长的势头,但却由于生态环保的需要也面临着巨大的转型压力。

总结

一个学期以来的金融数据获取与处理课程的学习也基本告一段落,这门课程对于初学者来说是一次巨大的挑战,但对我来说同样也是一次机遇,一次提升学习能力、计算机技术能力等综合能力的机遇,尽管今年由于疫情等不可控风险,我们面临着线上学习的困难,但这一个学期也挺过来了,在这一过程中我学会了很多,但我也更加明白了自己能力的不足以及与其他同学的差距,在今后的日后仍然要继续保持学习的习惯,加油!