In [2]:
# 4201369  王庆
# All imports for this cell come first: the original called re.compile()
# several lines before `import re`, which raises NameError on a fresh kernel.
import re
import time

import openpyxl
import pandas as pd
import requests

# Workbook whose column C holds the link cells parsed below.
xlsx = '半导体行业.xlsx'

# This frame is replaced by the CSV round-trip further down; the read is kept
# so the cell still fails early when the workbook is missing.
df = pd.read_excel(xlsx)

exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']
links = [c.value for c in C]
# Drop the first and the last cell of the column before concatenating.
links_1 = links[1:-1]
# Skip cells that do not hold text (e.g. empty cells read as None) so that
# str.join does not raise TypeError.
links_2 = ''.join(value for value in links_1 if isinstance(value, str))
# Every match is a pair of adjacent quoted strings: the first one is stored
# as 'Link', the second one as 'f_name'.
p = re.compile(r'"(.*?)","(.*?)"')
list_of_tuple = p.findall(links_2)

df2 = pd.DataFrame({'Link': [t[0] for t in list_of_tuple],
                    'f_name': [t[1] for t in list_of_tuple]})
df2.to_csv('半导体行业.csv')

df = pd.read_csv('半导体行业.csv')

# Rewrite "年报" (when preceded by four digits) or "年年报" as "年年度报告".
p = re.compile(r'(?<=\d{4})(年报)|(年年报)')
f_names = [p.sub('年年度报告', f) for f in df.f_name]
df['f_name'] = f_names
del p, f_names
print(df)
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-2-e681f1f6a17a> in <module>
      4 xlsx = '半导体行业.xlsx'
      5 
----> 6 df = pd.read_excel(xlsx)
      7 
      8 exf = openpyxl.load_workbook(xlsx)

F:\2345Downloads\anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
    297                 )
    298                 warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
--> 299             return func(*args, **kwargs)
    300 
    301         return wrapper

F:\2345Downloads\anaconda3\lib\site-packages\pandas\io\excel\_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, storage_options)
    334     if not isinstance(io, ExcelFile):
    335         should_close = True
--> 336         io = ExcelFile(io, storage_options=storage_options, engine=engine)
    337     elif engine and engine != io.engine:
    338         raise ValueError(

F:\2345Downloads\anaconda3\lib\site-packages\pandas\io\excel\_base.py in __init__(self, path_or_buffer, engine, storage_options)
   1069                 ext = "xls"
   1070             else:
-> 1071                 ext = inspect_excel_format(
   1072                     content=path_or_buffer, storage_options=storage_options
   1073                 )

F:\2345Downloads\anaconda3\lib\site-packages\pandas\io\excel\_base.py in inspect_excel_format(path, content, storage_options)
    947     assert content_or_path is not None
    948 
--> 949     with get_handle(
    950         content_or_path, "rb", storage_options=storage_options, is_text=False
    951     ) as handle:

F:\2345Downloads\anaconda3\lib\site-packages\pandas\io\common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    649         else:
    650             # Binary mode
--> 651             handle = open(handle, ioargs.mode)
    652         handles.append(handle)
    653 

FileNotFoundError: [Errno 2] No such file or directory: '半导体行业.xlsx'
In [3]:
def filter_links(words,df,include=True):
    """Select the rows of ``df`` whose ``f_name`` matches a list of substrings.

    Parameters
    ----------
    words : sequence of str
        Substrings looked up in every ``f_name`` value.
    df : pandas.DataFrame
        Frame with an ``f_name`` column of strings.
    include : bool, default True
        If True, keep rows whose ``f_name`` contains at least one word.
        If False, keep rows whose ``f_name`` contains none of the words.

    Returns
    -------
    pandas.DataFrame
        The selected rows, with all columns and the original index labels.
        With an empty ``words`` nothing is kept when ``include`` is True and
        everything is kept when it is False.
    """
    if include:
        keep = [any(word in name for word in words) for name in df.f_name]
    else:
        keep = [all(word not in name for word in words) for name in df.f_name]
    # A plain (possibly empty) Python list passed to df[...] is interpreted as
    # a column selection when it is empty, which silently drops every column.
    # An index-aligned boolean Series is always treated as a row mask.
    mask = pd.Series(keep, index=df.index, dtype=bool)
    return df.loc[mask]
In [ ]:
# Rows whose f_name contains any of these substrings are dropped.
excluded_kinds = ['摘要','审计报告','财务','英文版']
# Rows whose f_name contains any of these characters are dropped as well.
excluded_marks = ['(','(']
# Only rows whose f_name contains one of these names are kept.
wanted_companies = ['寒武纪','北方华创','中金科技','阿石创','澜起科技','富瀚微','博通集成','华微电子','雏鹰农牧','牧原股份']

df_all = filter_links(excluded_kinds, df, include=False)
df_original = filter_links(excluded_marks, df_all, include=False)
df_db = filter_links(wanted_companies, df_original, include=True)

# Columns consumed by the download step.
links = df_db['Link']
f_names = df_db['f_name']
In [ ]:
def get_PDF_url(url):
    """Fetch a page and return the target and text of its first ``<a>`` tag.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    tuple
        ``(href, fname)`` where ``href`` is the captured link prefixed with
        the first 26 characters of the final response URL and ``fname`` is
        the stripped anchor text. An empty tuple is returned, and a warning
        is emitted, when the page contains no matching anchor.
    """
    import warnings

    r = requests.get(url)
    try:
        r.encoding = 'utf-8'
        html = r.text
    finally:
        # Release the connection even if decoding the body fails.
        r.close()
    p = re.compile(r'<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
    a = p.search(html)
    if a is None:
        # The original built a Warning instance and discarded it, so the
        # message was never shown; warnings.warn actually reports it.
        warnings.warn('没有找到下载链接,请手动检查链接:%s' % url)
        return ()
    href = a.group(1)
    fname = a.group(2).strip()
    # NOTE(review): the 26-character prefix is presumably the scheme and host
    # of the site being scraped - confirm against the real URLs.
    href = r.url[:26] + href
    return (href, fname)
    
# Download every selected report into the working directory.
for link in links:
    found = get_PDF_url(link)
    if not found:
        # get_PDF_url returns an empty tuple when the page has no link;
        # unpacking it would raise ValueError and abort the whole run.
        continue
    href, fname = found
    r = requests.get(href, allow_redirects=True)
    try:
        # The context manager closes the output file after each download
        # instead of leaving the handle to the garbage collector.
        with open('%s' % fname, 'wb') as out_file:
            out_file.write(r.content)
    finally:
        # Close every response, not only the last one after the loop (which
        # also raised NameError when `links` was empty).
        r.close()
    # Pause between requests.
    time.sleep(10)
In [ ]:
import pdfplumber
import os

# Collect the names of the entries in the current working directory and
# show them.
filenames = os.listdir()
print(filenames)
In [ ]:
def getText(pdf):
    """Return the text of every page of a PDF, concatenated in page order.

    Parameters
    ----------
    pdf : str
        Path handed to ``fitz.open``.

    Returns
    -------
    str
        The page texts joined without a separator.
    """
    doc = fitz.open(pdf)
    try:
        chunks = []
        for page in doc:
            # Newer PyMuPDF releases name the method get_text; getText is the
            # older camelCase spelling. Prefer the new name, fall back to the
            # old one so both library generations work.
            extract = getattr(page, 'get_text', None) or page.getText
            chunks.append(extract())
    finally:
        # Close the document even if a page fails to extract.
        doc.close()
    return ''.join(chunks)
    
def parse_data_line(pdf):
    """Extract the rows of four values from the key-figures section of a report.

    The text of ``pdf`` is searched for the section headed
    "主要会计数据和财务指标"; if that heading is absent, the span between
    "主要会计数据" and "主要财务指标" is used instead. Inside that section every
    run of non-digit lines followed by four whitespace-terminated value
    fields is collected.

    Parameters
    ----------
    pdf : str
        Path of the PDF, passed on to ``getText``.

    Returns
    -------
    list of tuple
        One 5-tuple per match: the last captured label group followed by the
        four value strings.

    Raises
    ------
    ValueError
        If neither section heading is found (the original failed here with
        an AttributeError on ``None``).
    """
    text = getText(pdf)
    section_patterns = (
        r'\w{1,2}、主要会计数据和财务指标(.*?)(?=\w{1,2}、)',
        r'(\w{1,2})\s*主要会计数据(.*?)(?=(\w{1,2})\s*主要财务指标)',
    )
    subtext = None
    for pattern in section_patterns:
        found = re.search(pattern, text, re.DOTALL)
        if found is not None:
            subtext = found.group(0)
            break
    if subtext is None:
        raise ValueError('no accounting-data section found in %s' % pdf)
    # One value field: digits, commas, dots, percent signs, minus signs or
    # spaces, ended by a single whitespace character. The original wrote
    # "'...\n' and '...\s'", which always evaluates to this second string.
    subp = r'([0-9,.%\- ]*?)\s'
    p = re.compile(r'(\D+\n)+' + subp * 4)
    lines = p.findall(subtext)
    return lines
# Names of the entries in the working directory; the report PDFs are picked
# out of this listing by name.
filenames = os.listdir()
# NOTE(review): this rebinds the built-in name `list` for the rest of the
# session. The binding is kept because other cells may read it, but renaming
# it (e.g. to `companies`) is recommended.
list=['寒武纪','北方华创','中金科技','阿石创','澜起科技','富瀚微','博通集成','华微电子','雏鹰农牧','牧原股份']
import fitz
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# One line chart per company: yearly operating revenue read from each report.
for prefix in list:
    # os.listdir() returns names in arbitrary order; sort on the year slice of
    # the file name so the line is drawn chronologically.
    pdf = sorted(
        (f for f in filenames if prefix[:3] in f and f.endswith('.pdf')),
        key=lambda f: f[-13:-9],
    )
    if not pdf:
        # No report for this company: the title below would fail on year[0].
        continue
    # The four characters before the last nine of the file name hold the year.
    year = [p[-13:-9] for p in pdf]
    revenue = []
    for path, yr in zip(pdf, year):
        lines = parse_data_line(path)
        # int()/float() replace eval(): the strings come from file names and
        # PDF text and must never be executed as code.
        this_year = int(yr)
        df_fnc = pd.DataFrame(
            lines,
            columns=['', yr+'年', str(this_year-1)+'年', '本年比上年增减', str(this_year-2)+'年'],
        )
        # First parsed row, first value column, thousands separators removed.
        s = df_fnc.iloc[0,1].replace(',','')
        revenue.append(float(s))
    # Build the frame in one step instead of chained assignment
    # (df[col][[y]] = ...), which may write to a temporary copy.
    df_data = pd.DataFrame({'年份': year, '年营业收入': revenue})
    plt.figure()
    plt.plot(df_data['年份'],df_data['年营业收入'],label=u'年营业收入',color='b')
    plt.xlabel(u'(年)',fontsize=13)
    plt.ylabel(u'年营业收入(元)',fontsize=13,rotation=90)
    plt.legend(loc='best')
    plt.title(u'%s%s-%s年营业收入的可视化'%(prefix,str(year[0]),str(year[-1])),fontsize=13)
    plt.yticks(range(0,10**10,10**9))
    plt.grid(True)
    plt.show()
In [ ]:
# One bar chart per year: operating revenue of every company with a report
# whose file name contains that year.
year1=['2014','2015','2016','2017','2018','2019','2020']
for year2 in year1:
    pdf = [f for f in filenames if year2 in f and f.endswith('.pdf')]
    # The first four characters of the file name label the bar.
    fname = [p[:4] for p in pdf]
    revenue = []
    for path in pdf:
        lines = parse_data_line(path)
        df_fnc1 = pd.DataFrame(lines, columns=['0','1','1','2','3'])
        # First parsed row, first value column, thousands separators removed.
        s = df_fnc1.iloc[0,1].replace(',','')
        # float() replaces eval(): text extracted from a PDF must never be
        # executed as code.
        revenue.append(float(s))
    # Build the frame in one step instead of chained assignment with a
    # positional key on a string-labelled Series.
    df_data = pd.DataFrame({'营收': revenue}, index=fname)
    plt.figure()
    plt.bar(x=df_data.index,height=df_data['营收'],label=u'年营业收入',color='r')
    plt.xlabel(u'(公司)',fontsize=13)
    plt.ylabel(u'年营业收入(元)',fontsize=13,rotation=90)
    plt.legend(loc='best')
    plt.title(u'十家半导体业行业公司%s年营业收入'%year2,fontsize=13)
    plt.yticks(range(0,10**10,10**9))
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.show()