"""Scrape the SZSE periodic-report disclosure table for one company to CSV.

Workflow: open the Shenzhen Stock Exchange disclosure page in Edge, search
for the company, dump the result table's inner HTML to disk, parse that HTML
with ``DisclosureTable`` and write the parsed rows to a CSV file.
"""
import re
import time

import pandas as pd

# Executable handed to the Edge WebDriver service (machine-specific absolute
# path).
# NOTE(review): the Edge driver binary is normally named msedgedriver.exe --
# confirm this path points at the driver and not at the browser itself.
EDGE_DRIVER_PATH = 'C:\\Users\\Lenovo\\Downloads\\edgedriver_win64\\MicrosoftEdge.exe'
# SZSE disclosure page that hosts the search box and the result table.
DISCLOSURE_URL = 'https://www.szse.cn/disclosure/listed/fixed/index.html'
# Company name typed into the search box.
SEARCH_KEYWORD = '周大生'
# Intermediate dump of the table's inner HTML.
HTML_DUMP_PATH = 'innerHTML.html'
# Final CSV output.
CSV_PATH = '周大生.csv'


def fetch_disclosure_html(keyword, wait_seconds=3.0):
    """Search the disclosure page for *keyword* and return the table's HTML.

    Args:
        keyword: Text typed into the ``input_code`` search box.
        wait_seconds: Pause between submitting the search and reading the
            table.

    Returns:
        The ``innerHTML`` attribute of the ``disclosure-table`` element.
    """
    # Imported here rather than at module top so that DisclosureTable can be
    # used on previously saved HTML without Selenium being installed.
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.edge.service import Service

    service = Service(executable_path=EDGE_DRIVER_PATH)
    driver = webdriver.Edge(service=service)
    try:
        driver.get(DISCLOSURE_URL)  # open the SZSE disclosure page
        search = driver.find_element(By.ID, 'input_code')
        search.send_keys(keyword + Keys.RETURN)
        # The original author noted that reading the table had to be run
        # separately from the search step; presumably the table is refreshed
        # asynchronously, so pause before reading it.
        # TODO(review): replace with an explicit wait on a result-specific
        # condition once one is identified.
        time.sleep(wait_seconds)
        element = driver.find_element(By.ID, 'disclosure-table')
        return element.get_attribute('innerHTML')
    finally:
        # Always close the browser, even when a lookup above fails.
        driver.quit()


class DisclosureTable:
    """Parse the search-result table of the SZSE periodic-report page.

    The regular expressions below assume this markup (reconstructed from the
    names used in the parsing code -- verify against the live page):
    rows are ``<tr>`` elements whose first row is a header, each data row has
    four ``<td>`` cells (code, short name, title link, time), code and short
    name sit inside ``<a>``, the time sits inside ``<span>``, and the title
    link is an ``<a>`` carrying ``attachpath`` and ``href`` attributes (in
    that order) with the title text inside a ``<span>``.
    """

    _ROW = re.compile(r'<tr\b[^>]*>(.*?)</tr>', re.DOTALL)
    _CELL = re.compile(r'<td\b[^>]*>(.*?)</td>', re.DOTALL)
    _ANCHOR = re.compile(r'<a\b[^>]*>(.*?)</a>', re.DOTALL)
    _SPAN = re.compile(r'<span\b[^>]*>(.*?)</span>', re.DOTALL)
    # Three groups: attachpath, href, title -- get_link reads all three.
    _LINK = re.compile(
        r'<a\b.*?attachpath="(.*?)".*?href="(.*?)".*?'
        r'<span\b[^>]*>(.*?)</span>',
        re.DOTALL,
    )
    _CELLS_PER_ROW = 4

    def __init__(self, innerHTML):
        """Store the table HTML and split it into raw cell text.

        Args:
            innerHTML: Inner HTML of the disclosure table element.
        """
        self.html = innerHTML
        self.prefix = 'https://disc.szse.cn/download'
        self.prefix_href = 'https://www.szse.cn/'
        self.txt_to_df()

    def get_code(self, txt):
        """Return the stripped text of the first ``<a>`` element in *txt*."""
        return self._ANCHOR.search(txt).group(1).strip()

    def get_time(self, txt):
        """Return the stripped text of the first ``<span>`` element in *txt*."""
        return self._SPAN.search(txt).group(1).strip()

    def txt_to_df(self):
        """Split the table HTML into raw cells and store them as ``df_txt``."""
        rows = self._ROW.findall(self.html)
        # rows[0] is treated as the header row and skipped, as before.
        cells_per_row = [self._CELL.findall(row) for row in rows[1:]]
        # Rows without the four expected cells (e.g. a "no data" placeholder)
        # cannot be mapped onto the columns, so they are left out.
        tds = [td for td in cells_per_row if len(td) >= self._CELLS_PER_ROW]
        self.df_txt = pd.DataFrame({
            '证券代码': [td[0] for td in tds],
            '简称': [td[1] for td in tds],
            '公告标题': [td[2] for td in tds],
            '公告时间': [td[3] for td in tds],
        })

    def get_link(self, txt):
        """Extract ``[attachpath, href, title]`` from a title cell."""
        match = self._LINK.search(txt)
        attachpath = match.group(1).strip()
        href = match.group(2).strip()
        title = match.group(3).strip()
        return [attachpath, href, title]

    def get_data(self):
        """Build, store (``df_data``) and return the cleaned result table.

        Returns:
            A DataFrame with the columns 证券代码, 简称, 公告标题, attachpath,
            href and 公告时间; attachpath and href are prefixed with
            ``self.prefix`` and ``self.prefix_href`` respectively.
        """
        raw = self.df_txt
        codes = [self.get_code(td) for td in raw['证券代码']]
        short_names = [self.get_code(td) for td in raw['简称']]
        links = [self.get_link(td) for td in raw['公告标题']]
        times = [self.get_time(td) for td in raw['公告时间']]

        prefix = self.prefix
        # Detail-page links live on www.szse.cn, not on the download host.
        prefix_href = self.prefix_href
        df = pd.DataFrame({
            '证券代码': codes,
            '简称': short_names,
            '公告标题': [link[2] for link in links],
            'attachpath': [prefix + link[0] for link in links],
            'href': [prefix_href + link[1] for link in links],
            '公告时间': times,
        })
        self.df_data = df
        return df


def main():
    """Fetch the table, save its HTML, parse it and export the rows to CSV."""
    inner_html = fetch_disclosure_html(SEARCH_KEYWORD)
    with open(HTML_DUMP_PATH, 'w', encoding='utf-8') as f:
        f.write(inner_html)

    with open(HTML_DUMP_PATH, encoding="utf-8") as f:
        html = f.read()
    dt = DisclosureTable(html)
    df = dt.get_data()  # extract the information
    df.to_csv(CSV_PATH)  # write the result


if __name__ == "__main__":
    main()
环境变量因为版本更新,忘了怎么设,就用了绝对路径;提取信息向同学学了个模块