陈诺的作业三

代码


  from bs4 import BeautifulSoup
import re
import pandas as pd

# Load the saved disclosure page; the context manager closes the handle
# even if reading fails.
with open('002295.html', encoding='utf-8') as f:
    html = f.read()

# Name the parser explicitly so the parse tree (and therefore the prettified
# text the regexes below run against) does not depend on which optional
# parser libraries happen to be installed on the machine.
soup = BeautifulSoup(html, 'html.parser')
html_prettified = soup.prettify()

# Both output files receive the same prettified markup: the script used to
# re-parse the unchanged `html` before writing the second file, which
# produced identical text, so a single parse is enough.
for out_name in ('002295_prettified.html', 'disclosure-table_prettified.html'):
    with open(out_name, 'w', encoding='utf-8') as f:
        f.write(html_prettified)


# Split the prettified page into fragments, one per table row.
# NOTE(review): as written, '(.*?)' is a lazy group with nothing around it,
# so findall() returns an empty string for every position in the text rather
# than any row content. The delimiters (presumably '<tr>' ... '</tr>') look
# like they were lost when this listing was published -- confirm against the
# original script before relying on anything below.
p = re.compile('(.*?)', re.DOTALL)
trs = p.findall(html_prettified)

# For every fragment except the first (trs[1:]; presumably skipping a header
# row -- verify), collect the matches that end at a 年度报告" occurrence,
# then keep only the fragments that produced at least one match.
# Each kept entry is the list returned by findall(), not a single string.
p1 = re.compile('(.*?年度报告".*?)', re.DOTALL)
td1 = [p1.findall(tr) for tr in trs[1:]]
tds = [td for td in td1 if td!=[] ]

# Same filtering, but for fragments containing 年度报告摘要" instead.
p2 = re.compile('(.*?年度报告摘要".*?)', re.DOTALL)
td2 = [p2.findall(tr) for tr in trs[1:]]
tds1 = [td for td in td2 if td!=[] ]

# Append the 年度报告摘要 matches after the 年度报告 matches, in place.
# tds1 is left unchanged, so from here on tds is at least as long as tds1.
tds.extend(tds1)


# Take the first findall() result from the first match of every kept row.
# NOTE(review): this pattern has a single capture group, so each element of
# link_ftitles is a plain string, and with '(.*?)' that string is empty.
# The DataFrame construction later indexes lf[0], lf[1] and lf[2], which
# would raise IndexError on an empty string. Presumably the pattern
# originally had three groups (attachment path, href, title) and its HTML
# delimiters were stripped from this listing -- confirm.
p_link_ftitle = re.compile('(.*?)',
                                 re.DOTALL)
link_ftitles = [p_link_ftitle.findall(td[0])[0] for td in tds]

# Capture the text between 'finalpage/' and the next '/' in each row's first
# match (presumably the publication date segment of the attachment path --
# verify against the page). search() returns None when 'finalpage/' is
# absent, in which case .group(1) raises AttributeError.
p_pub_time = re.compile('.*?finalpage/(.*?)/.*?')
p_times = [p_pub_time.search(td[0]).group(1) for td in tds]

# NOTE(review): prefix and prefix_href are assigned here but never used in
# the visible script; presumably they were meant to be prepended to the
# 'attachpath' and 'href' columns below -- confirm.
prefix = 'https://disc.szse.cn/download'
prefix_href = 'https://www.szse.cn/'


# Assemble the result table, one list per column.
# NOTE(review): the '证券代码' and '简称' columns are given tds and tds1,
# which hold lists of regex matches rather than a code or a short name --
# presumably placeholders; confirm what should populate them. tds was
# extended with tds1 earlier in the script, so tds (and every column derived
# from it) is longer than tds1 whenever tds held any matches before the
# extend; pandas rejects dict columns of unequal length with a ValueError.
df = pd.DataFrame({'证券代码': tds,
                   '简称': tds1,
                   '公告标题': [lf[2].strip() for lf in link_ftitles],
                   'attachpath': [lf[0].strip() for lf in link_ftitles],
                   'href': [lf[1].strip() for lf in link_ftitles],
                   '公告时间': [t.strip() for t in p_times]
    })

# Sort by announcement title, descending, and save the sorted table.
# sort_values() returns a new DataFrame and leaves df untouched, so the
# sorted copy (data) is what has to be written; writing df would silently
# discard the ordering.
data = df.sort_values(by='公告标题', ascending=False)
data.to_csv('data.csv')

结果

结果截图