李禹含的作业二

代码


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

browser = webdriver.Edge()
browser.get('https://www.szse.cn/disclosure/listed/fixed/index.html')
element = browser.find_element(By.ID, 'input_code')  # Find the search box
element.send_keys('海晨股份' + Keys.RETURN)


element = browser.find_element(By.ID, 'disclosure-table')  #通过ID找
innerHTML = element.get_attribute('innerHTML')


f = open('innerHTML.html','w',encoding='utf-8')
f.write(innerHTML)
f.close()

browser.quit()


from bs4 import BeautifulSoup
import re
import pandas as pd


def to_pretty(fhtml):
    f = open(fhtml,encoding='utf-8')
    html = f.read()
    f.close()

    soup = BeautifulSoup(html)
    html_prettified = soup.prettify()

    f = open(fhtml[0:-5]+'-prettified.html', 'w', encoding='utf-8')
    f.write(html_prettified)
    f.close()
    return(html_prettified)


html = to_pretty('innerHTML.html')

def txt_to_df(html):
    # html table text to DataFrame
    p = re.compile('(.*?)', re.DOTALL)
    trs = p.findall(html)

    p2 = re.compile('(.*?)', re.DOTALL)
    tds = [p2.findall(tr) for tr in trs[1:]]

    df = pd.DataFrame({'证券代码': [td[0] for td in tds],
                       '简称': [td[1] for td in tds],
                       '公告标题': [td[2] for td in tds],
                       '公告时间': [td[3] for td in tds]})
    return(df)

df_txt = txt_to_df(html)


p_a = re.compile('(.*?)', re.DOTALL)
p_span = re.compile('(.*?)', re.DOTALL)

get_code = lambda txt: p_a.search(txt).group(1).strip()
get_time = lambda txt: p_span.search(txt).group(1).strip()

def get_link(txt):
    p_txt = '(.*?)'
    p = re.compile(p_txt, re.DOTALL)
    matchObj = p.search(txt)
    attachpath = matchObj.group(1).strip()
    href       = matchObj.group(2).strip()
    title      = matchObj.group(3).strip()
    return([attachpath, href, title])

def get_data(df_txt):
    prefix = 'https://disc.szse.cn/download'
    prefix_href = 'https://www.szse.cn/'
    df = df_txt
    codes = [get_code(td) for td in df['证券代码']]
    short_names = [get_code(td) for td in df['简称']]
    ahts = [get_link(td) for td in df['公告标题']]
    times = [get_time(td) for td in df['公告时间']]
    df = pd.DataFrame({'证券代码': codes,
                       '简称': short_names,
                       '公告标题': [aht[2] for aht in ahts],
                       'attachpath': [prefix + aht[0] for aht in ahts],
                       'href': [prefix_href + aht[1] for aht in ahts],
                       '公告时间': times
        })
    return(df)

df_data = get_data(df_txt)

df_data.to_csv('sample_data.csv')
结果

解释

1.导入相关模块 2.打开浏览器找到目标股票公告结果 3.找到表格行的开始和结束标签之间的HTML，建立一个html文件，写入innerHTML的内容 4.将目标内容读入html，并将结果导出为csv文件