# 对包含年报信息的html文件进行处理
# 提取证券代码、证券简称、年报下载链接、年报名称以及年报发布时间
# 对dataframe进行信息筛选
import re
def html_info_sh(tr):
    '''
    Parse one <tr> row of annual-report HTML scraped from the
    Shanghai Stock Exchange (SSE) site.

    Returns [code, name, href, title, date]: stock code, stock short
    name, absolute report download URL, report title, release date.
    '''
    cell_pattern = re.compile('<td.*?>(.*?)</td>', re.DOTALL)
    cells = cell_pattern.findall(tr)

    def inner_text(cell):
        # Text between the first '>' and the last '<' of the cell markup.
        return cell[cell.find('>') + 1:cell.rfind('<')]

    code = inner_text(cells[0])
    name = inner_text(cells[1])

    link_cell = cells[2]
    # Relative pdf path lives inside href="....pdf"; prepend the site root.
    begin = link_cell.find('href="') + 6
    finish = link_cell.find('.pdf"') + 4
    href = 'http://www.sse.com.cn' + link_cell[begin:finish]
    # Report title is the text right after the onclick handler, up to </span>.
    begin = link_cell.find('$(this))">') + 10
    finish = link_cell.find('</span>')
    title = link_cell[begin:finish]

    date = cells[3].strip()
    return [code, name, href, title, date]
def html_info_sz(tr):
    '''
    Parse one <tr> row of annual-report HTML scraped from the
    Shenzhen Stock Exchange (SZSE) site.

    Returns [code, name, href, title, date]: stock code, stock short
    name, absolute report download URL, report title, release date.
    '''
    # .*? is a non-greedy repeat: matches as little as possible while
    # still letting the overall pattern succeed.
    cells = re.findall('<td.*?>(.*?)</td>', tr, re.DOTALL)

    def strip_tags(cell):
        # Text between the first '>' and the last '<' of the cell markup.
        return cell[cell.find('>') + 1:cell.rfind('<')]

    code = strip_tags(cells[0])
    name = strip_tags(cells[1])

    doc_cell = cells[2]
    # The download path sits in an attachpath="....PDF" attribute.
    begin = doc_cell.find('attachpath="') + 12
    finish = doc_cell.find('.PDF"') + 4
    href = 'https://disc.szse.cn/download' + doc_cell[begin:finish]
    # First <span>...</span> inside the cell holds the report title.
    title = re.findall('<span.*?>(.*?)</span>', doc_cell, re.DOTALL)[0]

    date = strip_tags(cells[3])
    return [code, name, href, title, date]
def filter(df, col, word):
    '''
    Return a copy of df with every row removed whose value in column
    `col` contains `word` (membership tested with the `in` operator).

    NOTE: shadows the built-in `filter`; the name is kept for backward
    compatibility with existing callers (e.g. filter_mul).
    '''
    # Element-wise Series.apply replaces the original
    # to_frame().apply(..., axis=1) round-trip: same result, simpler
    # and faster (no per-row DataFrame machinery).
    contains_word = df[col].apply(lambda value: word in value)
    return df[~contains_word]
def filter_mul(df, col, words):
    '''
    Drop every row of df whose value in column `col` contains any of
    the strings in `words`; returns the filtered copy.
    '''
    remaining = df.copy()
    for keyword in words:
        # Same per-row membership test that filter() performs, inlined.
        hit = remaining[col].to_frame().apply(
            lambda row: keyword in row[col], axis=1)
        remaining = remaining[~hit]
    return remaining