Wang Xiaoqing's Final Report
Code
# Import the required modules
import pdfplumber
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep
import os
from bs4 import BeautifulSoup
import re
import requests
import fitz
import csv
import matplotlib.pyplot as plt
# Parse the SZSE disclosure site and download annual reports
driver_url = r"C:\edgedriver_win64\msedgedriver.exe"
prefs = {'profile.default_content_settings.popups': 0,
         'download.default_directory': r'C:\Desktop\python\金融数据获取\现代投资'}
options = webdriver.EdgeOptions()
options.add_experimental_option('prefs', prefs)
browser = webdriver.Edge(executable_path=driver_url, options=options)
browser.get('http://www.szse.cn/disclosure/listed/fixed/index.html')
element = browser.find_element(By.ID, 'input_code') # Find the search box
element.send_keys('天茂集团' + Keys.RETURN)
browser.find_element(By.CSS_SELECTOR, "#select_gonggao .c-selectex-btn-text").click()
browser.find_element(By.LINK_TEXT, "年度报告").click()
data_ele = browser.find_element(By.ID, 'disclosure-table')
innerHTML = data_ele.get_attribute('innerHTML')
f = open('innerHTML.html','w',encoding='utf-8')
f.write(innerHTML)
f.close()
# html = to_pretty('innerHTML.html')
w = open('innerHTML.html',encoding='utf-8')
html = w.read()
w.close()
import re
import pandas as pd
class DisclosureTable():
    '''
    Parse the search-results table on the SZSE periodic-report page
    '''
    def __init__(self, innerHTML):
        self.html = innerHTML
        self.prefix = 'https://disc.szse.cn/download'
        self.prefix_href = 'https://www.szse.cn/'
        # Patterns for the security code and the announcement time
        p_a = re.compile('<a.*?>(.*?)</a>', re.DOTALL)
        p_span = re.compile('<span.*?>(.*?)</span>', re.DOTALL)
        self.get_code = lambda txt: p_a.search(txt).group(1).strip()
        self.get_time = lambda txt: p_span.search(txt).group(1).strip()
        # Call txt_to_df to build the initial DataFrame
        self.txt_to_df()

    def txt_to_df(self):
        # html table text to DataFrame
        html = self.html
        p = re.compile('<tr>(.*?)</tr>', re.DOTALL)
        trs = p.findall(html)
        p2 = re.compile('<td.*?>(.*?)</td>', re.DOTALL)
        tds = [p2.findall(tr) for tr in trs[1:]]
        df = pd.DataFrame({'证券代码': [td[0] for td in tds],
                           '简称': [td[1] for td in tds],
                           '公告标题': [td[2] for td in tds],
                           '公告时间': [td[3] for td in tds]})
        self.df_txt = df

    # Extract the download link, page link and title from one table cell
    def get_link(self, txt):
        p_txt = '<a.*?attachpath="(.*?)".*?href="(.*?)".*?<span.*?>(.*?)</span>'
        p = re.compile(p_txt, re.DOTALL)
        matchObj = p.search(txt)
        attachpath = matchObj.group(1).strip()
        href = matchObj.group(2).strip()
        title = matchObj.group(3).strip()
        return([attachpath, href, title])

    def get_data(self):
        get_code = self.get_code
        get_time = self.get_time
        get_link = self.get_link
        #
        df = self.df_txt
        codes = [get_code(td) for td in df['证券代码']]
        short_names = [get_code(td) for td in df['简称']]
        ahts = [get_link(td) for td in df['公告标题']]
        times = [get_time(td) for td in df['公告时间']]
        #
        prefix = self.prefix
        prefix_href = self.prefix_href
        df = pd.DataFrame({'证券代码': codes,
                           '简称': short_names,
                           '公告标题': [aht[2] for aht in ahts],
                           'attachpath': [prefix + aht[0] for aht in ahts],
                           'href': [prefix_href + aht[1] for aht in ahts],
                           '公告时间': times})
        self.df_data = df
        return(df)
f = open('innerHTML.html',encoding='utf-8')
html = f.read()
f.close()
dt = DisclosureTable(html)
df = dt.get_data()
df.to_csv('天茂集团.csv')
df = pd.read_csv('天茂集团.csv')
words = ['摘要','问询函','社会责任']
def filter_links(words, df, include=True):
    ls = []
    for word in words:
        if include:
            ls.append([word in f for f in df.f_name])
        else:
            ls.append([word not in f for f in df.f_name])
    index = []
    for r in range(len(df)):
        flag = not include
        for c in range(len(words)):
            if include:
                flag = flag or ls[c][r]
            else:
                flag = flag and ls[c][r]
        index.append(flag)
    df2 = df[index]
    return(df2)
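# A small usage sketch of filter_links (the titles below are hypothetical, not taken from this
# report's data); it assumes the DataFrame has an f_name column and shows how the include flag
# switches between keeping and dropping the matching rows:
demo = pd.DataFrame({'f_name': ['2021年年度报告', '2021年年度报告摘要', '问询函回复']})
print(filter_links(['摘要', '问询函'], demo, include=False))   # keeps only '2021年年度报告'
print(filter_links(['年度报告'], demo, include=True))          # keeps the two rows containing '年度报告'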
df_all = filter_links(['摘要','问询函','社会责任'],df,include=False)
df_original = filter_links(['(','('], df_all)
words = ['摘要','问询函','社会责任']
include = False
# Visit each attachment link; Edge saves the PDF into the download directory set above
for i in range(len(df['attachpath'])):
    download_link = df['attachpath'][i]
    try:
        browser.get(download_link)
        sleep(5)  # this wait is necessary, because the download needs a little time
        print('下载完毕')
    except:
        print('下载失败')
browser.quit()
# Parse the SSE site and download annual reports
def sse_to_dataframe(filename):
    f = open(filename + '.html', encoding='utf-8')
    html = f.read()
    f.close()
    p_row = re.compile('<tr.*?>(.*?)</tr>', re.DOTALL)
    trs = p_row.findall(html)
    p_data = re.compile('<td.*?>(.*?)</td>', re.DOTALL)
    tds = [p_data.findall(t) for t in trs if p_data.findall(t) != []]
    p_code = re.compile('(\d{6})')
    p_name = re.compile('(\w+|-)')
    p_href = re.compile('<a.*?href="(.*?)".*?>')
    p_title = re.compile('<a.*?>(.*?)</a>')
    codes = [p_code.search(td[0]).group(1) for td in tds]
    names = [p_name.search(td[1]).group(1) for td in tds]
    links = [p_href.search(td[2]).group(1) for td in tds]
    # Some companies reused the same annual-report title every year, so prefix the publication year to keep them distinct
    titles = [td[3][:4] + p_title.search(td[2]).group(1) for td in tds]
    pubtime = [td[3] for td in tds]
    data = pd.DataFrame({'证券代码': codes,
                         '股票简称': names,
                         '公告标题': titles,
                         '公告链接': links,
                         '发布时间': pubtime})
    # Keep only annual reports published in 2012 or later
    for index, row in data.iterrows():
        title = row[2]
        time = row[-1][:4]
        if ("年度报告" not in title and "年报" not in title) or (int(time)) < 2012:
            data = data.drop(index=index)
    return(data)
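# sse_to_dataframe() expects the SSE query results to have been saved to '<filename>.html' first;
# that step is not shown in this report. A minimal sketch of one way to capture it with a Selenium
# session like the one used for the SZSE page (the URL below is assumed to be the SSE periodic-report
# query page, the five-second wait is an assumed load time, and the whole page source is saved rather
# than a specific table element):
browser.get('http://www.sse.com.cn/disclosure/listedinfo/regular/')
sleep(5)                                   # give the results table time to render
with open('sse_page.html', 'w', encoding='utf-8') as f:
    f.write(browser.page_source)
data_sse = sse_to_dataframe('sse_page')    # parse the saved page into a DataFrame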
class DisclosureTable_sh():
    '''
    Parse the search-results table on the SSE periodic-report page
    '''
    def __init__(self, innerHTML):
        self.html = innerHTML
        self.prefix = 'http://www.sse.com.cn'
        p_code = re.compile('(\d{6})')
        p_name = re.compile('(\w+|-)')
        p_href = re.compile('<a.*?href="(.*?)".*?>')
        p_title = re.compile('<a.*?>(.*?)</a>')
        self.get_code = lambda td: p_code.search(td).group(1)
        self.get_name = lambda td: p_name.search(td).group(1)
        self.get_href = lambda td: p_href.search(td).group(1)
        self.get_title = lambda td: p_title.search(td).group(1)
        self.txt_to_df()  # build the initial DataFrame that get_data() works from

    def txt_to_df(self):
        # html table text to DataFrame
        html = self.html
        p_tr = re.compile('<tr.*?>(.*?)</tr>', re.DOTALL)
        trs = p_tr.findall(html)
        p_td = re.compile('<td.*?>(.*?)</td>', re.DOTALL)
        tds = [p_td.findall(td) for td in trs if p_td.findall(td) != []]
        df = pd.DataFrame({'证券代码': [td[0] for td in tds],
                           '股票简称': [td[1] for td in tds],
                           '公告标题和链接': [td[2] for td in tds],
                           '公告时间': [td[3] for td in tds]})
        self.df_txt = df

    def get_data(self):
        get_code = self.get_code
        get_name = self.get_name
        get_href = self.get_href
        get_title = self.get_title
        df = self.df_txt
        prefix = self.prefix
        codes = [get_code(td) for td in df['证券代码']]
        names = [get_name(td) for td in df['股票简称']]
        links = [prefix + get_href(td) for td in df['公告标题和链接']]
        # Prefix the publication year so identically titled reports from different years stay distinct
        titles = [df['公告时间'][i][:4] + get_title(df['公告标题和链接'][i]) for i in range(len(df))]
        pubtime = [td for td in df['公告时间']]
        data = pd.DataFrame({'证券代码': codes,
                             '股票简称': names,
                             '公告标题': titles,
                             '公告链接': links,
                             '公告时间': pubtime})
        # df_alldata keeps every annual report; df_data additionally drops reports published before 2011
        data_latest10 = data.copy()
        for index, row in data.iterrows():
            title = row[2]
            time = int(row[-1][:4])
            if "年度报告" not in title and "年报" not in title:
                data = data.drop(index=index)
                data_latest10 = data_latest10.drop(index=index)
            elif time < 2011:
                data_latest10 = data_latest10.drop(index=index)
        self.df_alldata = data
        self.df_data = data_latest10
        return data_latest10
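# The post-processing below relies on names that are defined elsewhere and not shown in this report:
# df_data (the table returned by DisclosureTable_sh().get_data(), here assumed to also carry a 'year'
# and a '简称' column), p_bnb (a compiled pattern matching titles to be excluded, such as abstracts
# or half-year reports) and name (the company's short name). Hypothetical placeholders, for illustration only:
# p_bnb = re.compile('摘要|半年度')   # assumption about what p_bnb filters out
# name = '天茂集团'                   # assumption: short name of the company being processed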
for i in range(len(df_data)):
    c = p_bnb.findall(df_data['公告标题'][i])
    if len(c) != 0:
        df_data.drop([i], inplace=True)
df_data = df_data.drop_duplicates('year', keep='first', inplace=False)
df_data = df_data.reset_index(drop=True)
df_data['year_str'] = df_data['year'].astype(str)
df_data['name'] = name + df_data['year_str'] + '年报'
name1 = df_data['简称'][0]
df_data.to_csv('%s.csv' % name1)
year = {'year': ['2012', '2013', '2014','2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']}
dy = pd.DataFrame(year)
os.mkdir('%s年度报告'%name)
os.chdir(r'C:\Users\86191\Desktop\保险业\%s年度报告'%name)
for y in range(len(dy)):
    y = int(y)
    ye = dy['year'][y]
    name1 = df_data['简称'][0]
    rename = name1 + ye
    for a in range(len(df_data)):
        if df_data['name'][a] == '%s年报' % rename:
            href0 = df_data.iat[a, 3]
            r = requests.get(href0, allow_redirects=True)
            f = open('%s年度报告.pdf' % rename, 'wb')
            f.write(r.content)
            f.close()
            r.close()
# Extract basic information from the downloaded annual reports
hbcwsj = pd.DataFrame(index=range(2012,2021),columns=['营业收入','基本每股收益'])
hbsj = pd.DataFrame()
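# The loop below iterates over hb1, a table of the sample companies that is not constructed in the
# code shown here; column 2 is expected to hold the company short name and 'code' its ticker.
# A hypothetical illustration of the shape the loop expects:
# hb1 = pd.DataFrame({2: ['天茂集团', '中国人寿'], 'code': ['000627', '601628']})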
for i in range(len(hb1)):
    name2 = hb1[2][i]
    code = hb1['code']
    dcsv = pd.read_csv(r'C:\Users\86191\Desktop\python\金融数据获取\保险业%scsv文件.csv' % name2)
    dcsv['year_str'] = dcsv['year'].astype(str)
    os.chdir(r'C:\Users\86187\Desktop\食品制造业10年内年度报告\%s年度报告' % name2)
    #r = 0
    for r in range(len(dcsv)):
        year_int = dcsv.year[r]
        if year_int >= 2012:
            year2 = dcsv.year_str[r]
            aba = name2 + year2
            doc = fitz.open(r'%s年度报告.PDF' % aba)
            text = ''
            for j in range(22):
                page = doc[j]
                text += page.get_text()
            #p_year = re.compile('.*?(\d{4}) .*?年度报告.*?')
            #year_int = int(p_year.findall(text)[0])
            # Patterns for the four data items to be matched
            #p_rev = re.compile('(?<=\n)营业.*?收入.*?\n([\d+,.]*)\s?(?=\n)')
            p_rev = re.compile('(?<=\n)营业.*?收入.*?\n([\d+,.]+).*?(?=\n)')
            revenue = float(p_rev.search(text).group(1).replace(',', ''))
            #p_eps = re.compile('(?<=\n)基本每股收益.*?\n([-\d+,.]*)\s?(?=\n)')
            #p_eps = re.compile('(?<=\n)基本每股收益.*?\n.*?\n?([-\d+,.]+)\s?(?=\n)')
            p_eps = re.compile('(?<=\n)基\n?本\n?每\n?股\n?收\n?益.*?\n.*?\n?([-\d+,.]+)\s*?(?=\n)')
            eps = float(p_eps.search(text).group(1))
            #p_web = re.compile('(?<=\n)公司.*?网址.*?\n(.*?)(?=\n)')
            p_web = re.compile('(?<=\n).*?网址.*?\n(.*?)(?=\n)')
            web = p_web.search(text).group(1)
            p_site = re.compile('(?<=\n).*?办公地址.*?\n(.*?)(?=\n)')
            site = p_site.search(text).group(1)
            hbcwsj.loc[year_int, '营业收入'] = revenue
            hbcwsj.loc[year_int, '基本每股收益'] = eps
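# The alternative extraction loop below comes from a per-company version of the same procedure and
# assumes several names defined elsewhere: com (the company's folder name), df (that company's list
# of report titles), p_year (a year pattern like the commented-out p_year above), final (a per-company
# DataFrame indexed by year), plus name and code for that company; none of these are defined in the
# code shown in this report.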
# Loop over each year's annual report
for i in range(len(df)):
    title = df.iloc[i, 3]
    doc = fitz.open('./%s/%s.pdf' % (com, title))
    text = ''
    # Read the first 20 pages of each report; the key financial indicators normally appear there
    for j in range(20):
        page = doc[j]
        text += page.get_text()
    year = int(p_year.findall(text)[0])
    # Patterns for the data items to be matched
    p_rev = re.compile('(?<=\n)营业总?收入\(?\w?\)?\s?\n?([\d+,.]*)\s\n?')
    p_eps = re.compile('(?<=\n)基本每股收益\(元/?/?\n?股\)\s?\n?([-\d+,.]*)\s?\n?')
    p_site = re.compile('(?<=\n)\w*办公地址:?\s?\n?(.*?)\s?(?=\n)', re.DOTALL)
    p_web = re.compile('(?<=\n)公司\w*网址:?\s?\n?([a-zA-Z./:]*)\s?(?=\n)', re.DOTALL)
    # Strip the thousands separators from the matched revenue and convert to float
    revenue = float(p_rev.search(text).group(1).replace(',', ''))
    pre_rev = final.loc[year-1, '营业收入(元)']
    eps = p_eps.search(text).group(1)
    # Write revenue and EPS into the DataFrame created at the start
    final.loc[year, '营业收入(元)'] = revenue
    final.loc[year, '基本每股收益(元/股)'] = eps
final.to_csv('【%s】.csv' % com, encoding='utf-8-sig')  # save each company's data to a local csv file
# Match the office address and website; only the latest year is needed, so one match after the loop is enough
site = p_site.search(text).group(1)
web = p_web.search(text).group(1)
# Append the short name, ticker, office address and website to the end of the file
with open('【%s】.csv' % com, 'a', encoding='utf-8-sig') as f:
    content = '股票简称,%s\n股票代码,%s\n办公地址,%s\n公司网址,%s\n' % (name, code, site, web)
    f.write(content)
df = pd.read_csv('天茂集团.csv', header=None)
df.head()
x1 = df[1]                           # revenue column (strings with thousands separators)
a = x1.tolist()
del a[0]                             # drop the header row
c = [x.replace(',', '') for x in a]
d = [float(i) for i in c]            # revenue values as floats
y1 = df[0].tolist()                  # year column
del y1[0]
y1 = [int(i) for i in y1]
plt.xticks(rotation=45)
plt.xlabel("年份")
plt.ylabel("营业收入(元)")
plt.title("天茂集团营业收入变化趋势")
plt.xticks(range(2011,2022))
plt.plot(y1, d, "r", marker='*', ms=10, label="天茂集团")
plt.savefig("天茂集团营业收入变化趋势.png")
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_csv('天茂集团.csv', header=None)
df.head()
x = df[2]                            # EPS column
a = x.tolist()
del a[0]                             # drop the header row
c = [x.replace(',', '') for x in a]
d = [float(i) for i in c]            # EPS values as floats
y = df[0].tolist()                   # year column
del y[0]
y = [int(i) for i in y]
plt.rcParams['font.sans-serif'] = ['FangSong']   # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False
plt.xticks(rotation=45)
plt.xlabel("年份")
plt.ylabel("基本每股收益(元)")
plt.title("天茂集团基本每股收益变化趋势")
plt.xticks(range(2011,2022))
plt.plot(y, d, "r", marker='*', ms=10, label="天茂集团")
plt.savefig("天茂集团基本每股收益变化趋势.png")
plt.show()
# Plot comparing operating revenue across the listed companies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
name=['天茂集团','中国人寿','中国太保','中国平安','新华保险','西水股份','中国人保']
shouru=[]
for i in name:
    df = pd.read_csv(i+'.csv', header=None)
    df.head()
    x = df[1]
    a = x.tolist()
    del a[0]
    c = [x.replace(',', '') for x in a]
    d = [float(i) for i in c]
    shouru.append(d[9])
df1= pd.read_csv(name1+'.csv', header=None)
df1.head()
x1=df1[1]
a1=x1.tolist()
del a1[0]
c1=[x.replace(',','') for x in a1]
d1=[float(i) for i in c1]
# Append each year's data
for i in name2:
    df2 = pd.read_csv(i+'.csv', header=None)
    df2.head()
    x2 = df2[1]
    a2 = x2.tolist()
    del a2[0]
    c2 = [x.replace(',', '') for x in a2]
    d2 = [float(i) for i in c2]
    shouru.append(d2[5])
plt.xlabel('年份')
plt.xticks(rotation=45)
plt.ylabel('营业收入(元)')
plt.title("保险业营业收入对比")
plt.bar(x=name4,height=shouru)
plt.savefig('保险业营业收入对比.png')
plt.show()
# Plot comparing basic earnings per share across the listed companies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
name=['天茂集团','中国人寿','中国太保','中国平安','新华保险','西水股份','中国人保']
shouyi=[]
for i in name:
    df = pd.read_csv(i+'.csv', header=None)
    df.head()
    x = df[2]
    a = x.tolist()
    del a[0]
    c = [x.replace(',', '') for x in a]
    d = [float(i) for i in c]
    shouyi.append(d[9])
df1= pd.read_csv(name1+'.csv', header=None)
df1.head()
x1=df1[2]
a1=x1.tolist()
del a1[0]
c1=[x.replace(',','') for x in a1]
d1=[float(i) for i in c1]
shouyi.append(d1[6])
for i in name2:
    df2 = pd.read_csv(i+'.csv', header=None)
    df2.head()
    x2 = df2[2]
    a2 = x2.tolist()
    del a2[0]
    c2 = [x.replace(',', '') for x in a2]
    d2 = [float(i) for i in c2]
    shouyi.append(d2[5])
plt.xticks(rotation=45)
plt.xlabel('年份')
plt.ylabel('基本每股收益(元)')
plt.title("2021年上市公司基本每股收益比较")
plt.bar(x=name4, height=shouyi)
plt.savefig('保险业基本每股收益对比.png')
plt.show()
Results
Interpretation
- Comparing operating revenue across the industry, 中国平安's revenue far exceeds that of the other companies and is growing quickly, which shows real vitality.
- The comparison of basic earnings per share tells the same story: 中国平安 leads the other companies, so investors may reasonably consider its shares.
- Looking at the overall revenue trend, most of the companies' operating revenue has grown, which reflects an insurance industry that keeps developing, although there is a tendency toward polarization within it.
Reflections
This project was a huge challenge for me and I ran into plenty of difficulties. Only by writing the code myself did I realize how hard programmers have it; the constant errors were genuinely discouraging. Thanks to my own persistence and the help of classmates and teachers, I managed to finish this report.
Learning never ends, and I will keep raising the standards I set for myself.