from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Download step: open the SZSE periodic-report disclosure page in Edge, search
# for one stock, and save the inner HTML of the results table to disk.
browser = webdriver.Edge()  # use the Edge browser
try:
    browser.get('https://www.szse.cn/disclosure/listed/fixed/index.html')  # open the page
    search = browser.find_element(By.ID, 'input_code')  # locate the search box
    search.send_keys('神州信息' + Keys.RETURN)  # type the stock name and press Enter
    # NOTE(review): the original author remarks that this lookup should be run
    # separately from the lines above -- presumably the table needs time to
    # refresh after the search. Consider an explicit wait here; TODO confirm.
    element = browser.find_element(By.ID, 'disclosure-table')  # the disclosure table
    innerHTML = element.get_attribute('innerHTML')  # raw HTML of the table
    # The context manager closes the file even if the write fails.
    with open('innerHTML.html', 'w', encoding='utf-8') as f:
        f.write(innerHTML)
finally:
    # Always shut the browser down, even when a lookup above raises.
    browser.quit()
#提取数据
import re
import pandas as pd
class DisclosureTable():
    """Parse the search-result table of the SZSE periodic-report page.

    The constructor receives the table's inner HTML and immediately splits it
    into raw cell strings (``df_txt``). ``get_data`` then extracts the visible
    text and the links from those cells.

    Attributes:
        html: the HTML string passed to the constructor.
        prefix: URL prefix prepended to every ``attachpath`` value.
        prefix_href: URL prefix prepended to every ``href`` value.
        df_txt: DataFrame holding the raw HTML of each cell, one row per
            table row (set by ``txt_to_df``).
        df_data: DataFrame of extracted values (set by ``get_data``).
    """

    # Compiled once for the whole class; DOTALL lets a tag span several lines.
    _ROW = re.compile(r'<tr\b[^>]*>(.*?)</tr>', re.DOTALL)
    _CELL = re.compile('<td.*?>(.*?)</td>', re.DOTALL)
    _ANCHOR = re.compile('<a.*?>(.*?)</a>', re.DOTALL)
    _SPAN = re.compile('<span.*?>(.*?)</span>', re.DOTALL)
    _LINK = re.compile(
        '<a.*?attachpath="(.*?)".*?href="(.*?)".*?<span.*?>(.*?)</span>',
        re.DOTALL)

    # Number of <td> cells a row needs to be treated as a data row.
    _N_COLUMNS = 4

    def __init__(self, innerHTML):
        """Store ``innerHTML`` and build ``df_txt`` from it."""
        self.html = innerHTML
        self.prefix = 'https://disc.szse.cn/download'
        self.prefix_href = 'https://www.szse.cn/'
        self.txt_to_df()

    def get_code(self, txt):
        """Return the stripped text inside the first ``<a>`` element of ``txt``.

        Raises AttributeError when ``txt`` contains no ``<a>...</a>`` pair.
        """
        return self._ANCHOR.search(txt).group(1).strip()

    def get_time(self, txt):
        """Return the stripped text inside the first ``<span>`` of ``txt``.

        Raises AttributeError when ``txt`` contains no ``<span>...</span>`` pair.
        """
        return self._SPAN.search(txt).group(1).strip()

    def txt_to_df(self):
        """Split the HTML table into raw cell strings and store ``df_txt``.

        The first ``<tr>`` is treated as the header and skipped. Rows with
        fewer than four ``<td>`` cells are ignored, because they cannot be
        mapped onto the four expected columns.
        """
        rows = self._ROW.findall(self.html)
        cells_per_row = [self._CELL.findall(row) for row in rows[1:]]
        cells_per_row = [cells for cells in cells_per_row
                         if len(cells) >= self._N_COLUMNS]
        self.df_txt = pd.DataFrame({
            '证券代码': [cells[0] for cells in cells_per_row],
            '简称': [cells[1] for cells in cells_per_row],
            '公告标题': [cells[2] for cells in cells_per_row],
            '公告时间': [cells[3] for cells in cells_per_row],
        })

    def get_link(self, txt):
        """Extract ``[attachpath, href, title]`` from a title cell.

        ``attachpath`` and ``href`` are the values of the anchor's attributes
        of the same name; ``title`` is the text of the ``<span>`` that follows.
        Raises AttributeError when ``txt`` does not match that layout.
        """
        found = self._LINK.search(txt)
        return [found.group(i).strip() for i in (1, 2, 3)]

    def get_data(self):
        """Build, store (``df_data``) and return the extracted DataFrame.

        Columns: 证券代码, 简称, 公告标题, attachpath, href, 公告时间.
        ``attachpath`` is prefixed with ``self.prefix`` and ``href`` with
        ``self.prefix_href``.
        """
        raw = self.df_txt
        links = [self.get_link(cell) for cell in raw['公告标题']]
        df = pd.DataFrame({
            '证券代码': [self.get_code(cell) for cell in raw['证券代码']],
            '简称': [self.get_code(cell) for cell in raw['简称']],
            '公告标题': [title for _, _, title in links],
            'attachpath': [self.prefix + path for path, _, _ in links],
            # Page links use the site prefix, not the download prefix.
            'href': [self.prefix_href + href for _, href, _ in links],
            '公告时间': [self.get_time(cell) for cell in raw['公告时间']],
        })
        self.df_data = df
        return df
# Extraction step: read the saved table HTML, parse it, and export to CSV.
# The context manager closes the file even if reading fails.
with open('innerHTML.html', encoding="utf-8") as f:
    html = f.read()
dt = DisclosureTable(html)
df = dt.get_data()  # extract the disclosure records
# Bare expression kept from the original; it only shows the frame when it is
# the last statement of an interactive/notebook cell and does nothing in a script.
df
df.to_csv('data.csv')
# 通过 Selenium 库自动访问网页,定位网页源码中的搜索框,在其中输入股票名称并按回车;进入新页面后获取披露表的 HTML 并保存为文件,再读取该 HTML 文件,通过 DisclosureTable 类提取数据,得到所有披露信息。