import re
import time
import requests
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup

# Use a Chinese font and keep the minus sign rendering correctly
mpl.rcParams['font.sans-serif'] = ['FangSong']
mpl.rcParams['axes.unicode_minus'] = False
browser = webdriver.Edge()
browser.get('http://www.szse.cn/disclosure/listed/fixed/index.html')
element = browser.find_element(By.ID, 'input_code')
element.send_keys('深粮控股' + Keys.RETURN)
# Open the announcement-type dropdown first, then pick the annual-report entry;
# locating the menu item before the dropdown is opened can fail
baogao = browser.find_element(By.CSS_SELECTOR, '#select_gonggao > div > div > a > span.c-selectex-btn-text')
baogao.click()
nianbao = browser.find_element(By.CSS_SELECTOR, '#c-selectex-menus-3 > li:nth-child(1) > a')
nianbao.click()
# Fill in the query date range; the two selectors below are assumptions about
# the page's date-range inputs and may need adjusting
begin = browser.find_element(By.CSS_SELECTOR, 'input.input-left')
end = browser.find_element(By.CSS_SELECTOR, 'input.input-right')
begin.send_keys('2013-01-01' + Keys.RETURN)
end.send_keys('2022-12-31' + Keys.RETURN)
time.sleep(2)
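# The pagination loop below calls element_exist(), which is not defined in this
# excerpt; a minimal sketch matching its call site (it checks whether a link
# containing the given partial text, e.g. '下一页', is present on the page):
def element_exist(browser, partial_link_text):
    try:
        browser.find_element(By.PARTIAL_LINK_TEXT, partial_link_text)
        return True
    except Exception:
        return False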
trs = []
for r in range(5):
    time.sleep(2)
    element = browser.find_element(By.ID, 'disclosure-table')
    innerHTML = element.get_attribute('innerHTML')
    f = open('innerHTML.html', 'w', encoding='utf-8')
    f.write(innerHTML)
    f.close()
    f = open('innerHTML.html', encoding='utf-8')
    html = f.read()
    f.close()
    soup = BeautifulSoup(html, features='lxml')
    html_prettified = soup.prettify()
    # Collect every table row; the HTML tags in this and the following patterns
    # were lost in the original text and are reconstructed here
    p = re.compile('<tr>(.*?)</tr>', re.DOTALL)
    tr = p.findall(html_prettified)
    trs.extend(tr)
    flag = element_exist(browser, '下一页')
    if flag:
        nextpage = browser.find_element(By.PARTIAL_LINK_TEXT, '下一页')
        nextpage.click()
        wait = WebDriverWait(browser, 2)
    else:
        break
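# --- Parse the harvested <tr> rows into a DataFrame and download the PDFs ---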
prefix = 'https://disc.szse.cn/download'
prefix_href = 'http://www.szse.cn'
# Split each row into cells
p2 = re.compile('<td.*?>(.*?)</td>', re.DOTALL)
tds = [p2.findall(tr) for tr in trs[1:]]
tds = list(filter(None, tds))
# Security code and short name sit inside <a> tags
p_code = re.compile('<a.*?>(.*?)</a>', re.DOTALL)
codes = [p_code.search(td[0]).group(1).strip() for td in tds]
p_shortname = p_code
short_names = [p_shortname.search(td[1]).group(1).strip() for td in tds]
# Attachment path, detail-page href, and title from the announcement cell;
# the attachpath/href attribute names follow the SZSE table markup
p_link_ftitle = re.compile('<a.*?attachpath="(.*?)".*?href="(.*?)".*?<span.*?>(.*?)</span>',
                           re.DOTALL)
link_ftitles = [p_link_ftitle.findall(td[2])[0] for td in tds]
# Publication time; assumes the date sits in a <span> inside the cell
p_pub_time = re.compile('<span.*?>(.*?)</span>', re.DOTALL)
p_times = [p_pub_time.search(td[3]).group(1) for td in tds]
df = pd.DataFrame({'证券代码': codes,
                   '简称': short_names,
                   '公告标题': [lf[2].strip() for lf in link_ftitles],
                   'attachpath': [prefix + lf[0].strip() for lf in link_ftitles],
                   'href': [prefix_href + lf[1].strip() for lf in link_ftitles],
                   '公告时间': [t.strip() for t in p_times]
                   })
df.to_csv('data.csv')
browser.quit()
title = [lf[2].strip() for lf in link_ftitles]
url = [prefix + lf[0].strip() for lf in link_ftitles]
for i in range(0, 20):
    href = url[i]
    r = requests.get(href, allow_redirects=True)
    f = open('深粮控股' + title[i] + '.pdf', 'wb')
    f.write(r.content)
    f.close()
    r.close()
    time.sleep(2)
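# --- Shanghai Stock Exchange (SSE): repeat the same flow for 600058 ---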
browser = webdriver.Chrome()
browser.get('http://www.sse.com.cn/disclosure/listedinfo/announcement/')
element = browser.find_element(By.ID, 'inputCode')
time.sleep(2)
element.send_keys('600058' + Keys.RETURN)
time.sleep(2)
baogao = browser.find_element(By.CSS_SELECTOR, 'body > div.announcement_con > div.container.sse_content > div > div.col-lg-3.col-xxl-2 > div.search_inputCol > div.sse_outerItem.js_keyWords > div.sse_searchInput > input')
baogao.send_keys('年度报告' + Keys.RETURN)
time.sleep(2)
# Locate the annual-report entry in the announcement-type list
nianbao = browser.find_element(By.CSS_SELECTOR, 'body > div.announcement_con > div.container.sse_content > div > div.col-lg-3.col-xxl-2 > div.search_inputCol > div.js_typeListUl > div.announceTypeList > div.announceDiv.announce-child > ul > li:nth-child(1)')
nianbao.click()
time.sleep(1)
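# The clicks below appear to open the laydate range picker and select the
# start/end years before confirming; the exact nth-child items depend on the
# widget's current state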
browser.find_element(By.CSS_SELECTOR, ".range_date").click()
browser.find_element(By.CSS_SELECTOR, ".laydate-main-list-0 span:nth-child(1)").click()
browser.find_element(By.CSS_SELECTOR, ".layui-laydate-list > li:nth-child(5)").click()
browser.find_element(By.CSS_SELECTOR, ".laydate-main-list-1 span:nth-child(1)").click()
browser.find_element(By.CSS_SELECTOR, ".layui-laydate-list > li:nth-child(5)").click()
browser.find_element(By.CSS_SELECTOR, ".laydate-btns-confirm").click()
time.sleep(2)
# Locate the annual-report results table
element = browser.find_element(By.CSS_SELECTOR, 'body > div.announcement_con > div.container.sse_content > div > div.col-lg-9.col-xxl-10 > div.list_tableCol > div > div.table-responsive > table')
innerHTML = element.get_attribute('innerHTML')
f = open('innerHTML.html','w',encoding='utf-8')
f.write(innerHTML)
f.close()
f = open('innerHTML.html',encoding='utf-8')
html = f.read()
f.close()
trs=[]
soup = BeautifulSoup(html,features='lxml')
html_prettified = soup.prettify()
p = re.compile('<tr>(.*?)</tr>', re.DOTALL)
tr = p.findall(html_prettified)
trs.extend(tr)
prefix = 'https:'
p2 = re.compile('<td.*?>(.*?)</td>', re.DOTALL)
tds = [p2.findall(tr) for tr in trs[1:]]
tds = list(filter(None,tds))
# group(1) is the attachment link, group(2) the announcement title
p_link = re.compile('<a href="(.*?)".*?>(.*?)</a>', re.DOTALL)
title = [p_link.search(td[2]).group(2).strip() for td in tds]
link_ftitles = [p_link.search(td[2]).group(1).strip() for td in tds]
codes = tds[0][0].strip()
name = tds[0][1].strip()
p_times = [td[4] for td in tds]
df = pd.DataFrame({'证券代码': codes,
                   '简称': name,
                   '公告标题': [t for t in title],
                   'attachpath': [prefix + lf for lf in link_ftitles],
                   '公告时间': [t.strip() for t in p_times]
                   })
df.to_csv('data.csv')
attachpath= [prefix+lf for lf in link_ftitles]
biaoti=[t for t in title]
for i in range(0, 10):
    href = attachpath[i]
    r = requests.get(href, allow_redirects=True)
    f = open(biaoti[i] + '.pdf', 'wb')
    f.write(r.content)
    f.close()
    r.close()
    time.sleep(2)
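# --- Second pass over the SSE site, this time keeping the default date range ---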
browser = webdriver.Chrome()
browser.get('http://www.sse.com.cn/disclosure/listedinfo/announcement/')
element = browser.find_element(By.ID, 'inputCode')
time.sleep(2)
element.send_keys('600058' + Keys.RETURN)
time.sleep(2)
baogao = browser.find_element(By.CSS_SELECTOR, 'body > div.announcement_con > div.container.sse_content > div > div.col-lg-3.col-xxl-2 > div.search_inputCol > div.sse_outerItem.js_keyWords > div.sse_searchInput > input')
baogao.send_keys('年度报告' + Keys.RETURN)
time.sleep(2)
nianbao = browser.find_element(By.CSS_SELECTOR, 'body > div.announcement_con > div.container.sse_content > div > div.col-lg-3.col-xxl-2 > div.search_inputCol > div.js_typeListUl > div.announceTypeList > div.announceDiv.announce-child > ul > li:nth-child(1)')
nianbao.click()
time.sleep(1)
# Open the date-range picker, then confirm without changing the default range
shijian = browser.find_element(By.CSS_SELECTOR, 'body > div.announcement_con > div:nth-child(1) > div > div.announce_condition > div.announce_todayCon.d-flex.align-items-center.justify-content-between > div.today_leftDate > span.range_date.js_laydateSearch').click()
time.sleep(1)
button = browser.find_element(By.CSS_SELECTOR, '#layui-laydate1 > div.layui-laydate-footer > div > span').click()
time.sleep(2)
element = browser.find_element(By.CSS_SELECTOR, 'body > div.announcement_con > div.container.sse_content > div > div.col-lg-9.col-xxl-10 > div.list_tableCol > div > div.table-responsive > table')
innerHTML = element.get_attribute('innerHTML')
f = open('innerHTML.html','w',encoding='utf-8')
f.write(innerHTML)
f.close()
f = open('innerHTML.html',encoding='utf-8')
html = f.read()
f.close()
trs=[]
soup = BeautifulSoup(html,features='lxml')
html_prettified = soup.prettify()
p = re.compile('<tr>(.*?)</tr>', re.DOTALL)
tr = p.findall(html_prettified)
trs.extend(tr)
prefix = 'https:'
p2 = re.compile('<td.*?>(.*?)</td>', re.DOTALL)
tds = [p2.findall(tr) for tr in trs[1:]]
tds = list(filter(None,tds))
p_link = re.compile('<a href="(.*?)".*?>(.*?)</a>', re.DOTALL)
title = [p_link.search(td[2]).group(2).strip() for td in tds]
link_ftitles = [p_link.search(td[2]).group(1).strip() for td in tds]
codes = tds[0][0].strip()
name = tds[0][1].strip()
p_times = [td[4] for td in tds]
df = pd.DataFrame({'证券代码': codes,
                   '简称': name,
                   '公告标题': [t for t in title],
                   'attachpath': [prefix + lf for lf in link_ftitles],
                   '公告时间': [t.strip() for t in p_times]
                   })
df.to_csv('data.csv')
time.sleep(1)
attachpath= [prefix+lf for lf in link_ftitles]
biaoti=[t for t in title]
for i in range(0, 6):
    href = attachpath[i]
    r = requests.get(href, allow_redirects=True)
    f = open(biaoti[i] + '.pdf', 'wb')
    f.write(r.content)
    f.close()
    r.close()
    time.sleep(2)
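# --- Extract key financial figures from the downloaded PDFs with PyMuPDF (fitz) ---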
import fitz  # PyMuPDF
import warnings

shouru = []  # operating revenue per year
shouyi = []  # basic earnings per share per year
# Placeholder: fill with the paths of the 10 downloaded annual-report PDFs
# (name[9] below expects 10 entries)
name = ['gongsinianbao']
class NB():
    def __init__(self, pdf_filename):
        self.doc = fitz.open(pdf_filename)
        self.pdf_name = pdf_filename
        self.get_toc()
        self.jie_pages_title()
        self.get_key_findata_pages()
        self.get_target_page()

    def get_toc(self):
        # Find section headings like “第三节 …” on every page
        jie_zh = '一二三四五六七八九十'
        p = re.compile(r'(第[%s]{1,2}节)\s+(\w[、\w]*\w)' % jie_zh)
        toc = []
        for page in self.doc:
            txt = page.get_text()
            match = p.findall(txt)
            if len(match) != 0:
                first_match = match[0]
                toc.append((first_match, page.number))
        self.toc = toc

    def jie_pages_title(self):
        toc = self.toc
        jie_pages = {}
        jie_title = {}
        for t in toc:
            jie, title, pageNumber = t[0][0], t[0][1], t[1]
            if jie in jie_pages:
                jie_pages[jie].append(pageNumber)
            else:
                jie_pages[jie] = [pageNumber]
                jie_title[jie] = title
        self.jie_pages = jie_pages
        self.jie_title = jie_title

    def get_key_findata_pages(self):
        pages = ''
        jie_title = self.jie_title
        titles = ['公司简介和主要财务指标', '会计数据和财务指标摘要']
        for jie in jie_title:
            title = jie_title[jie]
            if title in titles:
                pages = self.jie_pages[jie]
                break
        if pages == '':
            # The original called Warning(...) without raising, which is a no-op
            warnings.warn('Section "公司简介和主要财务指标" / "会计数据和财务指标摘要" not found')
        self.key_fin_data_pages = pages
        return pages

    def get_target_page(self):
        pages = self.key_fin_data_pages
        pattern = re.compile('主要会计数据和财务指标.*?营业收入', re.DOTALL)
        target_page = ''
        for p in pages:
            page = self.doc[p]
            txt = page.get_text()
            matchObj = pattern.search(txt)
            if matchObj is not None:
                target_page = p
                break
        if target_page == '':
            warnings.warn('Page "主要会计数据和财务指标" not found')
        self.key_fin_data_page = target_page
        return target_page
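# Note (assumption): the extraction below runs on a single report, name[9],
# while shouru/shouyi are meant to collect all ten years. A sketch of the
# intended per-report loop, using the NB helper defined above:
#
#   for pdf_filename in name:           # assumes `name` lists all 10 PDF paths
#       nb = NB(pdf_filename)
#       page_no = nb.key_fin_data_page  # page holding “主要会计数据和财务指标”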
doc = fitz.open(name[9])
sf2021 = NB(name[9])  # NB locates the key-figures section; pages 5-7 are used directly below
# Key-figure rows in the PDF text use full-width parentheses, e.g. “营业收入（元）”
p1 = re.compile(r"(?<=\n)(营业收入（元）.*?)(?=\n归属于上市公司股东的净利润)", re.DOTALL)
x1 = doc[5].get_text()
x2 = doc[6].get_text()
x3 = doc[7].get_text()
t = x1 + x2 + x3
r = p1.findall(t)
a = r[0].split('\n')
p2 = re.compile(r"(?<=\n)(基本每股收益（元/股）.*?)(?=\n加权平均净资产收益率)", re.DOTALL)
r2 = p2.findall(t)
if not r2:
    # Fall back to a second anchor when the first lookahead text is absent;
    # the original `while r2 == []` retry could loop forever
    p3 = re.compile(r"(?<=\n)(基本每股收益（元/股）.*?)(?=\n稀释每股收益（元/股）)", re.DOTALL)
    r2 = p3.findall(t)
b = r2[0].split('\n')
shouru.append(a[1])
shouyi.append(b[1])
p4=re.compile(r"\n股票简称 \n(.*?)股票代码(.*?)\n股票上市证券交易所.*?办公地址(.*?)\n办公地址的邮政编码 .*?
公司网址(.*?)\n电子信箱",re.DOTALL)#提取上市公司信息
info=p4.findall(t)
d=pd.DataFrame({'股票简称':[t[0] for t in info],
'股票代码':[t[1] for t in info],
'办公地址':[t[2] for t in info],
'公司网址':[t[3] for t in info]})
f=pd.DataFrame({'营业收入':[t for t in shouru],
'基本每股收益':[s for s in shouyi]},index=['2012年','2013年','2014年','2015年',
'2016年','2017年','2018年','2019年','2020年','2021年'])
#将提取的数据存入csv文件
df.to_csv('公司财务.csv')
d.to_csv('公司信息.csv')
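# --- Plot revenue and EPS for the ten wholesale-industry companies ---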
companies = ['神州数码', '远大控股', '厦门信达', '浙商中拓', '爱施德',
             '五矿发展', '建发股份', '物产中大', '厦门国贸', '九州通']
for comp in companies:
    df = pd.read_excel("C:/Users/13644/Desktop/{}.xlsx".format(comp))
    fig = plt.figure(figsize=(20, 8), dpi=80)
    ax = fig.add_subplot(111)
    lin1 = ax.plot(df.index, df["营业收入"], marker="o", label="营业收入")
    ax.set_title('{}公司2012—2021营业收入及基本每股收益'.format(comp), size=20)
    ax.set_xlabel("时间", size=18)
    ax.set_ylabel("营业收入", size=18)
    # Annotate each revenue point with its value
    for i, j in df["营业收入"].items():
        ax.text(i, j + 20, str(j), va="bottom", ha="center", size=15)
    ax1 = ax.twinx()  # second y-axis for EPS
    lin2 = ax1.plot(df.index, df["基本每股收益"], marker="o", color="red", label="基本每股收益")
    ax1.set_ylabel("基本每股收益", size=18)
    # Merge the two lines into a single legend
    lins = lin1 + lin2
    labs = [l.get_label() for l in lins]
    ax.legend(lins, labs, loc="upper left", fontsize=15)
    plt.show()
df=pd.read_excel("C:/Users/13644/Desktop/1.xlsx",header=0,index_col=0)
for i in range(10):
a=i+12
plt.figure(figsize=(12,10))
plt.xlabel('年份')
plt.ylabel('基本每股收益')
plt.title("20{}年上市公司基本每股收益比较".format(a))
plt.bar(x=df.columns,height=df.iloc[i],width=0.5)
df=pd.read_excel("C:/Users/13644/Desktop/2.xlsx",header=0,index_col=0)
for i in range(10):
a=i+12
plt.figure(figsize=(12,10))
plt.xlabel('年份')
plt.ylabel('营业收入/元')
plt.title("20{}年上市公司营业收入比较".format(a))
plt.bar(x=df.columns,height=df.iloc[i],width=0.5)
For this final project I analyzed industry classification code 51, the wholesale industry, which contains 81 listed companies, so the sample is fairly large. While parsing the disclosures I was somewhat puzzled by the classification itself: these companies span medicine, transportation, and more, so it is not obvious to me why they are all grouped under wholesale. In the final industry analysis I found that several typical wholesale companies posted fairly bleak numbers from 2015 to 2019. My reading is that the rapid growth of e-commerce platforms hit these firms hard, but the companies that survived have reversed the decline in recent years, and their figures are trending upward.

To conclude: this was honestly the hardest course I have taken in college. I had a little Python background, so I enrolled with some confidence, and the first few units did feel easy, but things became difficult once we reached regular expressions. The course demands not only fluency with several libraries but, more importantly, a deeper understanding of Python itself, from loop constructs to the handling of small details. I genuinely improved a great deal. Many thanks to Professor Wu for the patient guidance this semester; I believe what I learned here will stay with me for life.