from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import pandas as pd
# open the SZSE periodic-report disclosure page in Chrome
browser = webdriver.Chrome()
browser.get('https://www.szse.cn/disclosure/listed/fixed/index.html')
# search for the company "金螳螂" (stock code 002081) and press Enter
element = browser.find_element(By.ID, 'input_code')
element.send_keys('金螳螂' + Keys.RETURN)
# open the industry dropdown and pick "建筑业" (construction)
browser.find_element(By.CSS_SELECTOR, "#select_hangye .glyphicon").click()
browser.find_element(By.LINK_TEXT, "建筑业").click()
# open the announcement-type dropdown and pick "年度报告" (annual report)
browser.find_element(By.CSS_SELECTOR, "#select_gonggao .c-selectex-btn-text").click()
browser.find_element(By.LINK_TEXT, "年度报告").click()
# jump to page 2 of the result list
browser.find_element(By.LINK_TEXT, "2").click()
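# Hedged addition (not part of the original script): the result table loads via
# AJAX, so the clicks above can race the page. An explicit wait on the table is
# more reliable than a fixed sleep; WebDriverWait/expected_conditions are the
# standard Selenium tools for this.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.ID, 'disclosure-table')))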
# grab the inner HTML of the disclosure table and save it for offline parsing
element1 = browser.find_element(By.ID, 'disclosure-table')
innerHTML = element1.get_attribute('innerHTML')
with open('innerHTML.html', 'w', encoding='utf-8') as f:
    f.write(innerHTML)
def to_pretty(fhtml):
    # read the raw HTML, prettify it with BeautifulSoup, save a copy, return it
    with open(fhtml, encoding='utf-8') as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')
    html_prettified = soup.prettify()
    with open(fhtml[0:-5] + '-prettified.html', 'w', encoding='utf-8') as f:
        f.write(html_prettified)
    return html_prettified
html = to_pretty('innerHTML.html')
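# For orientation (a hand-written sketch, not captured from the live page):
# each data row of the prettified table looks roughly like
#   <tr>
#     <td>002081</td>
#     <td>金螳螂</td>
#     <td><a attachpath="/disc/..." href="/disclosure/..."><span>...年度报告</span></a></td>
#     <td><span>YYYY-MM-DD</span></td>
#   </tr>
# which is the structure the regular expressions below take apart.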
def txt_to_df(html):
    # split the table HTML into rows, then cells, and load them into a DataFrame
    p = re.compile('<tr>(.*?)</tr>', re.DOTALL)
    trs = p.findall(html)
    p2 = re.compile('<td.*?>(.*?)</td>', re.DOTALL)
    tds = [p2.findall(tr) for tr in trs[1:]]  # trs[0] is the header row
    df = pd.DataFrame({'证券代码': [td[0] for td in tds],
                       '简称': [td[1] for td in tds],
                       '公告标题': [td[2] for td in tds],
                       '公告时间': [td[3] for td in tds]})
    return df
df_txt = txt_to_df(html)
# patterns for extracting the text inside the <a> and <span> cells
p_a = re.compile('<a.*?>(.*?)</a>', re.DOTALL)
p_span = re.compile('<span.*?>(.*?)</span>', re.DOTALL)
get_code = lambda txt: p_a.search(txt).group(1).strip()
get_time = lambda txt: p_span.search(txt).group(1).strip()
def get_link(txt):
    # pull the attachment path, the detail-page href, and the title out of one <a> cell
    p_txt = '<a.*?attachpath="(.*?)".*?href="(.*?)".*?<span.*?>(.*?)</span>'
    p = re.compile(p_txt, re.DOTALL)
    matchObj = p.search(txt)
    attachpath = matchObj.group(1).strip()
    href = matchObj.group(2).strip()
    title = matchObj.group(3).strip()
    return [attachpath, href, title]
def get_data(df_txt):
    # clean every HTML cell into plain text and build absolute URLs
    prefix = 'http://disc.szse.cn/download'  # host for PDF attachments
    prefix_href = 'http://www.szse.cn/'      # host for detail pages
    df = df_txt
    codes = [get_code(td) for td in df['证券代码']]
    short_names = [get_code(td) for td in df['简称']]
    ahts = [get_link(td) for td in df['公告标题']]
    times = [get_time(td) for td in df['公告时间']]
    df = pd.DataFrame({'证券代码': codes,
                       '简称': short_names,
                       '公告标题': [aht[2] for aht in ahts],
                       'attachpath': [prefix + aht[0] for aht in ahts],
                       'href': [prefix_href + aht[1] for aht in ahts],
                       '公告时间': times})
    return df
df_data = get_data(df_txt)
df_data.to_csv('sample_data_from_002081.csv')
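# Hedged addition (not in the original): release the browser once the data has
# been saved; nothing after this point needs Selenium any more.
browser.quit()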
The script first uses Selenium to open the SZSE periodic-report disclosure page, locates the search box, types the company name "金螳螂" (Gold Mantis), and submits it with an Enter keystroke sent via send_keys. It then locates the dropdown filters and clicks "建筑业" (construction industry) and "年度报告" (annual report), and clicks the pager to move to page 2. Having identified the table element through the browser inspector, it retrieves the table's source with get_attribute('innerHTML') and writes it to "innerHTML.html". The to_pretty function uses the third-party BeautifulSoup module to reformat (prettify) the scraped markup and returns it as the string html. txt_to_df then uses regular expressions to extract the 证券代码 (stock code), 简称 (short name), 公告标题 (announcement title), and 公告时间 (announcement time) cells into a pandas DataFrame, and get_data cleans those cells into plain text plus absolute attachment and detail-page URLs. The final result is stored in df_data and written out as "sample_data_from_002081.csv".
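As a quick sanity check of the extracted links, the sketch below (an illustration, assuming the requests package is installed and the attachpath URLs are reachable; the output file name first_report.pdf is made up) downloads the first announcement PDF:

import requests
import pandas as pd

df = pd.read_csv('sample_data_from_002081.csv')
url = df['attachpath'][0]          # download link of the first announcement
r = requests.get(url, timeout=30)  # fetch the PDF from disc.szse.cn
with open('first_report.pdf', 'wb') as f:
    f.write(r.content)             # save the raw bytes to disk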