袁鹏的实验报告

代码


  from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    import time
    from parse_disclosure_table import DisclosureTable
    import re
    import requests
    import pandas as pd
    import fitz
    import csv
    import matplotlib.pyplot as plt
    import matplotlib.font_manager as fm



    # Start an Edge session and open the SZSE periodic-report disclosure page.
    browser = webdriver.Edge()
    browser.get('https://www.szse.cn/disclosure/listed/fixed/index.html')

    # Every UI step below is a plain click, so the steps are listed as
    # (locator strategy, locator) pairs and clicked strictly in this order.
    click_sequence = (
        # Report type: open the announcement-type dropdown and choose "年度报告".
        (By.CSS_SELECTOR, "#select_gonggao .glyphicon"),
        (By.LINK_TEXT, "年度报告"),
        # Date range: open the left date input and drive the date picker.
        # NOTE(review): the nth-child indices depend on the live page layout — confirm
        # they still select the intended year/month/day.
        (By.CSS_SELECTOR, ".input-left"),
        (By.CSS_SELECTOR, "#c-datepicker-menu-1 .calendar-year span"),
        (By.CSS_SELECTOR, ".active li:nth-child(113)"),
        (By.LINK_TEXT, "6月"),
        (By.CSS_SELECTOR, ".active > .dropdown-menu li:nth-child(1)"),
        (By.CSS_SELECTOR, "#c-datepicker-menu-1 tr:nth-child(2) > .weekend:nth-child(1) > .tdcontainer"),
        (By.CSS_SELECTOR, ".today > .tdcontainer"),
        # Submit the query.
        (By.ID, "query-btn"),
    )
    for locator_kind, locator in click_sequence:
        browser.find_element(locator_kind, locator).click()

    # Download the industry-classification PDF and store it next to the script.
    href = 'http://www.csrc.gov.cn/csrc/c100103/c1558619/1558619/files/1638277734844_11692.pdf'
    # Context managers release the connection and the file handle even if the
    # download or the write fails part-way.
    with requests.get(href, allow_redirects=True) as r:
        # Fail loudly on an HTTP error instead of saving an error page as a ".pdf".
        r.raise_for_status()
        with open('2021年3季度上市公司行业分类结果.pdf', 'wb') as f:
            f.write(r.content)

    # Collect (code, name) pairs of the companies listed under "医药制造业" in the
    # classification PDF. The section is read from doc[19] and doc[20]
    # (zero-based page indices hard-coded for this particular PDF).
    doc = fitz.open('2021年3季度上市公司行业分类结果.pdf')
    try:
        toc_txt1 = doc[19].get_text()
        toc_txt2 = doc[20].get_text()
    finally:
        # The document is only needed for the two text extractions above.
        doc.close()

    # Raw strings keep the regex escapes (\n, \d, \s, \w) intact without relying on
    # Python's deprecated handling of invalid string escapes.
    # First page: everything after the "医药制造业" heading up to the last newline.
    r1 = re.compile(r'(?<=医药制造业\n)(.*)(?=\n)', re.DOTALL)
    txt1 = r1.findall(toc_txt1)
    # Second page: everything after the heading up to (not including) "罗欣药业".
    r2 = re.compile(r'(?<=医药制造业\n)(.*?)(?=罗欣药业)', re.DOTALL)
    txt2 = r2.findall(toc_txt2)

    # Each entry is a 6-digit code followed by a company short name.
    r = re.compile(r'(\d{6})\s*(\w+)\s*')
    text1 = r.findall(txt1[0])
    text2 = r.findall(txt2[0])
    firm = text1 + text2


    # For every company, query it on the disclosure page by stock code and save the
    # innerHTML of the result table to 'inner_HTML_<name>.html'.
    try:
        for code, name in firm:
            # Type the stock code into the search box and confirm with ENTER.
            browser.find_element(By.ID, "input_code").click()
            browser.find_element(By.ID, 'input_code').send_keys('%s' % code)
            time.sleep(0.5)
            browser.find_element(By.ID, "input_code").send_keys(Keys.ENTER)

            # Give the table a moment to refresh before reading its HTML.
            table = browser.find_element(By.ID, 'disclosure-table')
            time.sleep(0.5)
            innerHTML = table.get_attribute('innerHTML')

            # Open the output file only once the HTML is available, so a Selenium
            # failure cannot leak a handle or leave an empty file behind.
            with open('inner_HTML_%s.html' % name, 'w', encoding='utf-8') as f:
                f.write(innerHTML)
            time.sleep(0.5)

            # Remove the selected company so the next query starts clean.
            browser.find_element(By.CSS_SELECTOR, ".selected-item:nth-child(2) > .icon-remove").click()
            time.sleep(0.5)
    finally:
        # Always shut the browser down, even when a lookup fails mid-loop.
        browser.quit()

    # Parse each saved disclosure table and store the result as 'data_<name>.csv'.
    for _code, name in firm:
        # 'with' guarantees the HTML file is closed even if reading fails.
        with open('inner_HTML_%s.html' % name, encoding='utf-8') as f:
            t = f.read()

        dt = DisclosureTable(t)
        df = dt.get_data()
        df.to_csv('data_%s.csv' % name)



    # Keep only the non-summary report links from each company's CSV, download
    # them as '<name>_<k>.pdf', and build two lookup tables for later steps:
    #   df5 - one row per kept report ('股票简称', 'attachpath')
    #   df4 - one row per company that has at least one kept report ('股票简称')
    # Rows are collected in a list and turned into DataFrames once, because
    # DataFrame.append no longer exists in pandas 2.x.
    report_rows = []
    for _code, name in firm:
        with open('data_%s.csv' % name, 'r', newline='', encoding='utf-8') as csvfile:
            csvreader = csv.reader(csvfile)
            next(csvreader, None)  # skip the header row (tolerates an empty file)
            # NOTE(review): column 3 is assumed to be the announcement title and
            # column 4 the PDF URL — confirm against DisclosureTable.get_data().
            urls = [row[4] for row in csvreader if '摘要' not in row[3]]

        report_rows.extend((name, url) for url in urls)

        # Download the kept PDFs for this company.
        for k, url in enumerate(urls):
            with requests.get(url, allow_redirects=True) as r:
                # Do not save an HTTP error page as a PDF.
                r.raise_for_status()
                with open('{0}_{1}.pdf'.format(name, k), 'wb') as f:
                    f.write(r.content)

    df5 = pd.DataFrame(report_rows, columns=['股票简称', 'attachpath'])
    # dict.fromkeys de-duplicates while preserving first-seen order.
    df4 = pd.DataFrame({'股票简称': list(dict.fromkeys(n for n, _ in report_rows))})

    # Extract "股票简称", "股票代码", "办公地址" and "公司网址" from the first
    # downloaded report ('<name>_0.pdf') of every company in df4.
    #
    # Each field is taken from the first page on which its labelled value is
    # found. Fields are searched independently, so one missing field no longer
    # aborts the scan for the others. Fields that are never found are left
    # empty (NaN) in df2 and reported on stdout.
    field_patterns = {
        '股票简称': re.compile(r'股票简称\s+(.+?)\n', re.DOTALL),
        '股票代码': re.compile(r'股票代码\s+(\d+)\s+', re.DOTALL),
        '办公地址': re.compile(r'办公地址\s+(.+?)\n', re.DOTALL),
        '公司网址': re.compile(r'公司[国际互联网]*网址\s+(.*?.+?)\s+', re.DOTALL),
    }

    info_rows = []
    for name in df4['股票简称']:
        lst_text = {}
        doc = fitz.open('{0}_0.pdf'.format(name))
        try:
            for page in range(doc.page_count):
                txt = doc[page].get_text()
                for field, pattern in field_patterns.items():
                    if field in lst_text:
                        continue  # already found on an earlier page
                    found = pattern.search(txt)
                    if found:
                        lst_text[field] = found.group(1)
                if len(lst_text) == len(field_patterns):
                    break  # everything found; no need to read further pages
        finally:
            doc.close()

        missing = [field for field in field_patterns if field not in lst_text]
        if missing:
            # Say which company/fields failed instead of a bare error marker.
            print('错误', name, missing)
        info_rows.append(lst_text)

    # Build the frame once; DataFrame.append was removed in pandas 2.x.
    df2 = pd.DataFrame(info_rows, columns=['股票简称', '股票代码', '办公地址', '公司网址'])

    # Extract "营业收入(元)" and "基本每股收益(元/股)" from every downloaded
    # report and write one CSV per company and metric.

    # The unit's parentheses are matched as literal characters, full-width or
    # ASCII. Written bare, "(元)" would be a capture group: the label would not
    # match and findall would return tuples instead of the number.
    revenue_pattern = re.compile(r'\s营业[总]*收入[（(]元[）)]\s*(-?[\d,.]+)\s*', re.DOTALL)
    year_pattern = re.compile(r'\n(20[\d]{2}\s年)年度报告', re.DOTALL)
    eps_pattern = re.compile(r'\s基本每股收益[（(]元/股[）)]\s*(-?[\d,.]+)\s*', re.DOTALL)

    def _collect_yearly_metric(company, report_count, value_pattern):
        """Return a one-row DataFrame (index: company, columns: report years).

        Reads '<company>_<i>.pdf' for i in range(report_count). The year label
        comes from the first page; the value is the first capture of
        value_pattern on the last page where it matches. Reports without a
        match are skipped instead of reusing a stale value.
        """
        frames = []
        for i in range(report_count):
            doc = fitz.open('{0}_{1}.pdf'.format(company, i))
            try:
                year = year_pattern.findall(doc[0].get_text())
                value = None
                for page in range(doc.page_count):
                    matches = value_pattern.findall(doc[page].get_text())
                    if matches:
                        value = matches[0]
            finally:
                doc.close()
            if value is None or not year:
                continue
            frames.append(pd.DataFrame(value, index=[company], columns=year))
        if not frames:
            return pd.DataFrame(index=[company])
        # Later reports go first, matching the original prepend order.
        return pd.concat(frames[::-1], join='outer', axis=1)

    metric_specs = (
        (revenue_pattern, '{}——营业收入.csv'),
        (eps_pattern, '{}——每股收益.csv'),
    )
    for x in df4['股票简称']:
        report_count = int((df5['股票简称'] == x).sum())
        if report_count == 0:
            continue  # nothing was downloaded for this company
        for value_pattern, filename in metric_specs:
            data = _collect_yearly_metric(x, report_count, value_pattern)
            data.to_csv(filename.format(x), encoding='utf-8')



# Plotting: configure fonts and prepare the top-10 revenue / EPS tables.
# 'mpl' was never imported; plt.rcParams is the same settings object as
# matplotlib.rcParams, so it is used instead to avoid a NameError.
plt.rcParams['font.sans-serif']=['SimHei']   # font able to render the Chinese labels
plt.rcParams['axes.unicode_minus']=False     # draw the minus sign with an ASCII hyphen


df = pd.read_csv('行业信息.csv',index_col=0,dtype=(str))
df_eps = pd.read_csv('基本每股收益.csv',index_col=0)
df_revenue = pd.read_csv('营业收入.csv',index_col=0)
df_information = pd.read_csv('医药制造行业信息.csv',index_col=0,dtype=(str))
# Rescale revenue by 1e8.
df_revenue = df_revenue/100000000

# Rank the revenue columns by their total over all rows and keep the ten largest.
df_revenue.loc['sum'] = df_revenue.sum()
df_revenue = df_revenue.T
df_revenue = df_revenue.sort_values(by='sum',ascending=False,axis=0)
df_revenue = df_revenue.iloc[:10]

# Column labels end with the 6-character suffix '主营业务收入'; strip it to get
# the bare company names.
top10_list = [label[:-6] for label in df_revenue.index.values.tolist()]

# Back to rows-as-index orientation, dropping the helper 'sum' row (last row).
df_revenue = df_revenue.T
df_revenue = df_revenue.iloc[:-1]

# Keep only the top-10 companies in the EPS table.
dropped_eps_columns = [name+'基本每股收益' for name in df['上市公司简称'] if name not in top10_list]
df_eps = df_eps.drop(columns=dropped_eps_columns)

# Use the bare company name as the column label in both tables.
df_revenue = df_revenue.rename(columns={name+'主营业务收入':name for name in top10_list})
df_eps = df_eps.rename(columns={name+'基本每股收益':name for name in top10_list})


# Revenue trend lines: two figures with five companies each
# (columns 0-4, then columns 5-9 of df_revenue).
x = df_revenue.index
revenue_line_figures = (
    ("营业收入随时间变化趋势图", range(0, 5)),
    ("营业收入随时间变化趋势图(续)", range(5, 10)),
)
for figure_title, column_positions in revenue_line_figures:
    plt.figure(figsize=(10,8))
    for pos in column_positions:
        plt.plot(x, df_revenue.iloc[:,pos], marker='^', markersize=8,
                 label=df_revenue.columns[pos], linewidth=2.0)

    plt.xticks(range(2012,2022), fontsize=16)
    plt.xlabel("年份", fontsize=16)
    plt.yticks(fontsize=16)
    plt.ylabel("营业收入", fontsize=16)
    plt.title(figure_title, fontsize=16)
    plt.legend(loc=1, prop={'size':15})
    plt.grid()



# Basic EPS trend lines: two figures with five companies each. The column
# positions are listed explicitly because the plotting order differs from the
# column order of df_eps.
x = df_eps.index
eps_line_figures = (
    # (column positions in plotting order, y-axis label, legend location)
    ((4, 8, 5, 6, 1), "基本每股收益", 0),
    ((9, 7, 3, 2, 0), "基本每股收益(元/股)", 1),
)
for column_positions, y_label, legend_loc in eps_line_figures:
    plt.figure(figsize=(10,8))
    for pos in column_positions:
        plt.plot(x, df_eps.iloc[:,pos], marker='s', markersize=7,
                 label=df_eps.columns[pos], linewidth=2.0)

    plt.xticks(range(2012,2022), fontsize=16)
    plt.xlabel("年份", fontsize=16)
    plt.yticks(fontsize=16)
    plt.ylabel(y_label, fontsize=16)
    plt.title("基本每股收益随时间变化趋势图", fontsize=16)
    plt.legend(loc=legend_loc, prop={'size':15})
    plt.grid()



# Grouped bar charts: revenue and basic EPS, each split into the first five
# rows and the remaining rows of its table.
eps_column_order = [4,8,5,6,1,9,7,3,2,0]
bar_charts = (
    # (data to plot, figure size, y-axis label, title, extra legend options)
    (df_revenue[:5], (10,8), '营业收入', '营业收入对比图', {}),
    (df_revenue[5:], (10,8), '主营业务收入', '营业收入对比图', {}),
    (df_eps.iloc[:5,eps_column_order], (18,9), '基本每股收益', '基本每股收益对比图', {}),
    (df_eps.iloc[5:,eps_column_order], (18,9), '基本每股收益(元/股)', '基本每股收益对比图', {'ncol': 2}),
)
for table, size, y_label, chart_title, legend_extra in bar_charts:
    table.plot(kind='bar', figsize=size, width=0.6)
    plt.xticks(fontsize=16, rotation=0)
    plt.xlabel('年份', fontsize=16,rotation=0)
    plt.yticks(fontsize=16)
    plt.ylabel(y_label, fontsize=16)
    plt.title(chart_title, fontsize=16)
    plt.legend(loc=1, prop={'size':14}, **legend_extra)
    plt.grid()


结果

结果截图 结果截图 结果截图 结果截图 结果截图 结果截图

解释

从所选公司的营业收入趋势来看，虽然中间有少许起伏波动，但医药制造业的总体发展趋势是向好的；各公司的基本每股收益走势较为平缓，这也说明整个行业受外界影响较小。