# Authors (group members): 余梓依, 吴菲, 钟佳文
#
# Step 1: drive a browser to the Shenzhen Stock Exchange (SZSE) periodic-report
# disclosure page, search for one company, filter the results to annual
# reports, and save the result table's inner HTML to ``innerHTML.html``.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

browser = webdriver.Edge()
try:
    # Open the SZSE periodic-report disclosure page.
    browser.get('https://www.szse.cn/disclosure/listed/fixed/index.html')

    # Find the search box and submit the company short name.
    element = browser.find_element(By.ID, 'input_code')
    element.send_keys('华锦股份' + Keys.RETURN)

    # Open the announcement-category dropdown (located by its link text).
    element = browser.find_element(By.LINK_TEXT, '请选择公告类别')
    element.click()

    # Restrict the results to annual reports.
    element = browser.find_element(By.LINK_TEXT, '年度报告')
    element.click()

    # NOTE(review): there is no explicit wait between applying the filter and
    # reading the table, so the captured HTML may predate the refresh if the
    # page updates asynchronously -- confirm against the live page.
    element = browser.find_element(By.ID, 'disclosure-table')
    innerHTML = element.get_attribute('innerHTML')
finally:
    # Always shut the driver down, even if one of the lookups above fails.
    browser.quit()

# Persist the table markup for the parsing step.
with open('innerHTML.html', 'w', encoding='utf-8') as f:
    f.write(innerHTML)
import re

import pandas as pd


class DisclosureTable:
    """Parse the search-result table of the SZSE periodic-report page.

    The table's inner HTML is split into rows and cells with regular
    expressions, then the code, short name, announcement title, download
    path, detail-page link and publication time are extracted per row.

    NOTE(review): the HTML-tag regexes below were reconstructed from the
    variable names and from the capture groups the code reads (the tags had
    been stripped from the source text). Confirm them against a saved
    ``innerHTML.html`` before relying on the output.
    """

    # Splits the table into rows and each row into cells.
    _P_TR = re.compile('<tr>(.*?)</tr>', re.DOTALL)
    _P_TD = re.compile('<td.*?>(.*?)</td>', re.DOTALL)
    # Text inside the first <a> / <span> element of a cell.
    _P_A = re.compile('<a.*?>(.*?)</a>', re.DOTALL)
    _P_SPAN = re.compile('<span.*?>(.*?)</span>', re.DOTALL)
    # Groups: 1 = attachpath attribute, 2 = href attribute, 3 = title text.
    _P_LINK = re.compile(
        '<a.*?attachpath="(.*?)".*?href="(.*?)".*?<span.*?>(.*?)</span>',
        re.DOTALL,
    )
    # Column names of the raw table, in cell order.
    _COLUMNS = ('证券代码', '简称', '公告标题', '公告时间')

    def __init__(self, innerHTML: str) -> None:
        """Store the table HTML and immediately split it into raw cells.

        Args:
            innerHTML: inner HTML of the disclosure table element.
        """
        self.html = innerHTML
        # Prefix for downloadable attachments (prepended to ``attachpath``).
        self.prefix = 'https://disc.szse.cn/download'
        # Prefix for announcement detail pages (prepended to ``href``).
        self.prefix_href = 'https://www.szse.cn/'
        self.txt_to_df()

    def get_code(self, txt: str) -> str:
        """Return the stripped text of the first ``<a>`` element in *txt*.

        Raises:
            AttributeError: if *txt* contains no ``<a>`` element.
        """
        return self._P_A.search(txt).group(1).strip()

    def get_time(self, txt: str) -> str:
        """Return the stripped text of the first ``<span>`` element in *txt*.

        Raises:
            AttributeError: if *txt* contains no ``<span>`` element.
        """
        return self._P_SPAN.search(txt).group(1).strip()

    def txt_to_df(self) -> None:
        """Split the table HTML into a DataFrame of raw cell HTML.

        The result is stored in ``self.df_txt`` with one column per table
        column; each value is still the unparsed HTML of that cell.
        """
        rows = self._P_TR.findall(self.html)
        # The first row is skipped, as in the original code (header row).
        cells_per_row = [self._P_TD.findall(row) for row in rows[1:]]
        # Rows without the four expected cells (e.g. extra header or
        # placeholder rows) cannot be parsed and are skipped instead of
        # raising IndexError.
        cells_per_row = [
            cells for cells in cells_per_row
            if len(cells) >= len(self._COLUMNS)
        ]
        self.df_txt = pd.DataFrame({
            name: [cells[i] for cells in cells_per_row]
            for i, name in enumerate(self._COLUMNS)
        })

    def get_link(self, txt: str) -> list:
        """Extract ``[attachpath, href, title]`` from a title cell.

        Raises:
            AttributeError: if *txt* does not match the link pattern.
        """
        match = self._P_LINK.search(txt)
        attachpath = match.group(1).strip()
        href = match.group(2).strip()
        title = match.group(3).strip()
        return [attachpath, href, title]

    def get_data(self) -> pd.DataFrame:
        """Parse every raw cell and return the cleaned table.

        The DataFrame is also stored in ``self.df_data``. ``attachpath`` and
        ``href`` are turned into absolute URLs using ``self.prefix`` and
        ``self.prefix_href`` respectively.
        """
        raw = self.df_txt
        codes = [self.get_code(cell) for cell in raw['证券代码']]
        short_names = [self.get_code(cell) for cell in raw['简称']]
        links = [self.get_link(cell) for cell in raw['公告标题']]
        times = [self.get_time(cell) for cell in raw['公告时间']]

        prefix = self.prefix
        # Detail-page links use their own prefix, not the download prefix.
        prefix_href = self.prefix_href
        df = pd.DataFrame({
            '证券代码': codes,
            '简称': short_names,
            '公告标题': [link[2] for link in links],
            'attachpath': [prefix + link[0] for link in links],
            'href': [prefix_href + link[1] for link in links],
            '公告时间': times,
        })
        self.df_data = df
        return df


# Step 2: parse the saved table HTML and export it as CSV. Guarded so that
# importing this module does not require ``innerHTML.html`` to exist; running
# the file as a script behaves as before.
if __name__ == '__main__':
    with open('innerHTML.html', encoding='utf-8') as f:
        html = f.read()

    dt = DisclosureTable(html)
    df = dt.get_data()
    df.to_csv('data.csv')
# 解释都在注释里 (All explanations are in the code comments.)