In [12]:
import re
import pandas as pd
import os
import openpyxl
import requests
import time

# Work inside the project folder. Raw strings keep the Windows backslashes
# literal instead of relying on unrecognised escape sequences.
os.chdir(r'H:\Young')
xlsx = r'H:\Young\纺织与制造行业.xlsx'

df = pd.read_excel(xlsx)

# Read column C of the active sheet cell by cell; its values are expected to
# look like `sample` below.
exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']

links = [c.value for c in C]
links_1 = links[1:-1]  # drop the first and the last cell of the column
# Blank cells come back as None; skip anything that is not text so that
# join() does not raise TypeError.
links_2 = ''.join(v for v in links_1 if isinstance(v, str))

sample='=HYPERLINK("http://news.windin.com/ns/bulletin.php?code=425B4105F9CC&id=125834710&type=1","稳健医疗:2020年年度报告(英文版)")'

# Each formula contributes one ("url", "title") pair of quoted strings.
p = re.compile(r'"(.*?)","(.*?)"')
list_of_tuple = p.findall(links_2)

df2 = pd.DataFrame({'link': [t[0] for t in list_of_tuple],
                    'f_name': [t[1] for t in list_of_tuple]})

# Normalise abbreviated titles: "年报" directly after four digits, or
# "年年报" anywhere, becomes "年年度报告".
p = re.compile(r'(?<=\d{4})(年报)|(年年报)')
f_names = [p.sub('年年度报告', f) for f in df2.f_name]
df2['f_name'] = f_names; del p, f_names
In [13]:
def filter_links(words, df, include=True):
    """Select rows of ``df`` by keyword match on its ``f_name`` column.

    Parameters
    ----------
    words : iterable of str
        Keywords looked up as plain substrings of each ``f_name`` value.
    df : pandas.DataFrame
        Frame with an ``f_name`` column of strings.
    include : bool, default True
        True  -> keep rows whose ``f_name`` contains at least one keyword.
        False -> keep rows whose ``f_name`` contains none of the keywords.

    Returns
    -------
    pandas.DataFrame
        The selected rows with their original columns and index labels.
        With no keywords, ``include=True`` selects nothing and
        ``include=False`` selects every row.
    """
    words = list(words)
    if len(df) == 0:
        # An empty mask passed to df[...] is read as "select no columns",
        # which would drop the schema; return an empty copy instead.
        return df.copy()
    hits = [any(word in name for word in words) for name in df.f_name]
    mask = hits if include else [not hit for hit in hits]
    return df.loc[mask]
# Titles containing any of these keywords are discarded up front.
unwanted_words = ['摘要','英文版','问询函','审计','财务','风险','债券','报告书']
df_all = filter_links(unwanted_words, df2, include=False)

# Split on the bracket marker: titles without it go to df_orig, titles with
# it go to df_up (minus the ones that announce a cancellation).
bracket_marks = ['(','(']
df_orig = filter_links(bracket_marks, df_all, include=False)
df_up = filter_links(bracket_marks, df_all, include=True)
df_up = filter_links(['取消'], df_up, include=False)
In [15]:
def sub_with_update(df_up, df_orig):
    """Return a copy of ``df_orig`` whose links point at the revised reports.

    A row of ``df_up`` counts as a revision of a row of ``df_orig`` when the
    original ``f_name`` is a substring of the revised ``f_name``. For every
    such pair the second-to-last column of the original row is overwritten
    with the second-to-last column of the revised row. For the two-column
    ``[link, f_name]`` frames built in this notebook that column is ``link``.
    When several revisions match one original, the last one in ``df_up`` wins.

    Parameters
    ----------
    df_up : pandas.DataFrame
        Revised announcements; needs an ``f_name`` column.
    df_orig : pandas.DataFrame
        Original announcements; needs an ``f_name`` column. Not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_orig`` with the substitutions applied.
    """
    # Write into the copy so the result carries the updates and the caller's
    # frame stays untouched.
    df_newest = df_orig.copy()
    for i, f in enumerate(df_orig.f_name):
        for j, fn in enumerate(df_up.f_name):
            if f in fn:
                df_newest.iloc[i, -2] = df_up.iloc[j, -2]
    return df_newest
In [54]:
# Merge the revised links into the originals.
df_newest = sub_with_update(df_up, df_orig)
df_all.sort_values(by=['f_name'], inplace=True, ignore_index=True)

# The company short name is taken as the first four characters of the title.
df_newest['公司简称'] = [title[:4] for title in df_newest.f_name]
counts = df_newest['公司简称'].value_counts()

# One frame per company for the ten short names with the most rows.
ten_company = [filter_links([short_name], df_newest)
               for short_name in counts.index[:10]]

if not os.path.exists('10companies'):
    os.makedirs('10companies')

# Write each company's rows to 10companies/<short name>.csv.
for df_com in ten_company:
    cn = df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)

ten_csv = os.listdir('10companies')
In [17]:
os.chdir(r'H:\Young\10companies')
f_1 = os.listdir(r'H:\Young\10companies')

# Flat lists of every announcement link / title across the company CSVs.
links = []
f_names = []

for f_2 in f_1:
    # Other cells of this notebook place further files in this folder (the
    # resolved-links CSV, the downloaded reports, the revenue CSV), so only
    # files that look like per-company CSVs are read.
    if not f_2.lower().endswith('.csv'):
        continue
    try:
        f_3 = pd.read_csv(f_2)
    except UnicodeDecodeError:
        # The per-company files are written by DataFrame.to_csv with its
        # default encoding; a file that does not decode is not one of them.
        continue
    if 'link' not in f_3.columns or 'f_name' not in f_3.columns:
        continue
    links.extend(f_3['link'])
    f_names.extend(f_3['f_name'])
In [56]:
def get_PDF_url(url):
    """Fetch an announcement page and extract the first ``<a>`` tag on it.

    Parameters
    ----------
    url : str
        Address of the announcement page.

    Returns
    -------
    tuple
        ``(href, fname)``: the anchor's href prefixed with the first 26
        characters of the final request URL, and the stripped anchor text.
        An empty tuple ``()`` is returned, after a warning, when the page
        has no matching anchor.
    """
    import warnings  # only needed on the failure path

    r = requests.get(url)
    try:
        r.encoding = 'utf-8'
        html = r.text
    finally:
        r.close()  # HTML has been read; release the connection
    p = re.compile(r'<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
    a = p.search(html)  # the first <a> is the target tag, so search() suffices
    if a is None:
        # warnings.warn actually reports the problem; a bare Warning(...)
        # only builds an exception object and discards it.
        warnings.warn('没有找到下载链接。请手动检查链接:%s' % url)
        return ()
    fname = a.group(2).strip()
    # Build the full link. For URLs shaped like the notebook's sample the
    # 26-character prefix is 'http://news.windin.com/ns/'.
    href = r.url[:26] + a.group(1)
    return (href, fname)
In [57]:
hrefs = []; fnames = []

for link in links:
    result = get_PDF_url(link)
    if not result:
        # get_PDF_url returns an empty tuple when the page has no download
        # anchor; skip that link instead of failing on the unpacking.
        time.sleep(10)
        continue
    href, fname = result
    hrefs.append(href)
    fnames.append(fname)
    time.sleep(10)
    # Checkpoint after every page so an interrupted run keeps its progress.
    pd.DataFrame({'href': hrefs, 'fname': fnames}).to_csv('links纺织行业.csv')

# Write once more after the loop so the file exists, and reflects this run,
# even when no link could be resolved.
df_final_links = pd.DataFrame({'href': hrefs, 'fname': fnames})
df_final_links.to_csv('links纺织行业.csv')

df_final_links = pd.read_csv('links纺织行业.csv')
f_names = df_final_links['fname']
hrefs = df_final_links['href']
In [18]:
# Download every resolved link into the current directory under its title.
for href, f_name in zip(hrefs, f_names):
    r = requests.get(href, allow_redirects=True)
    try:
        # The with-block closes the output file even if the write fails.
        with open('%s' % f_name, 'wb') as out_file:
            out_file.write(r.content)
    finally:
        r.close()  # release each response, not only the last one
    time.sleep(10)  # pause between downloads
In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import csv

# Read the revenue table; the first row holds the column names.
# newline='' is what the csv module asks for on files handed to csv.reader,
# and the with-block closes the file even if parsing raises.
with open(r'H:\Young\10companies\纺织企业营收.csv', newline='') as csvFile:
    reader = csv.reader(csvFile)
    ls = list(reader)

df = pd.DataFrame(data=ls[1:], columns=ls[0])
# Index the rows by the first column parsed as dates.
df.index = pd.to_datetime(df.iloc[:, 0])

# csv.reader yields strings; convert every column to float for plotting.
df = df.astype('float')
In [3]:
%matplotlib inline
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],  # make sure Chinese text is displayed
    'axes.unicode_minus': False,    # make sure minus signs are displayed
})

# Revenue line chart for 华升股份.
fig, ax = plt.subplots()
ax.plot(df.index, df['华升股份'], linewidth=2)
ax.set_xlabel('年度')
ax.set_ylabel('营业收入')
ax.set_title('数据中纺织企业营收折线图')
ax.grid(True, axis='both')
In [4]:
# Revenue line chart for 际华集团.
fig, ax = plt.subplots()
ax.plot(df.index, df['际华集团'], linewidth=2)
ax.set_xlabel('年度')
ax.set_ylabel('营业收入')
ax.set_title('数据中纺织企业营收折线图')
ax.grid(True, axis='both')
In [5]:
# Revenue line chart for 华纺股份.
fig, ax = plt.subplots()
ax.plot(df.index, df['华纺股份'], linewidth=2)
ax.set_xlabel('年度')
ax.set_ylabel('营业收入')
ax.set_title('数据中纺织企业营收折线图')
ax.grid(True, axis='both')
In [6]:
# Revenue line chart for 上海三毛.
fig, ax = plt.subplots()
ax.plot(df.index, df['上海三毛'], linewidth=2)
ax.set_xlabel('年度')
ax.set_ylabel('营业收入')
ax.set_title('数据中纺织企业营收折线图')
ax.grid(True, axis='both')
In [7]:
# Revenue line chart for 棒杰股份.
fig, ax = plt.subplots()
ax.plot(df.index, df['棒杰股份'], linewidth=2)
ax.set_xlabel('年度')
ax.set_ylabel('营业收入')
ax.set_title('数据中纺织企业营收折线图')
ax.grid(True, axis='both')
In [8]:
# Revenue line chart for 孚日股份.
fig, ax = plt.subplots()
ax.plot(df.index, df['孚日股份'], linewidth=2)
ax.set_xlabel('年度')
ax.set_ylabel('营业收入')
ax.set_title('数据中纺织企业营收折线图')
ax.grid(True, axis='both')
In [9]:
# Revenue line chart for 联发股份.
fig, ax = plt.subplots()
ax.plot(df.index, df['联发股份'], linewidth=2)
ax.set_xlabel('年度')
ax.set_ylabel('营业收入')
ax.set_title('数据中纺织企业营收折线图')
ax.grid(True, axis='both')
In [10]:
# Revenue line chart for 新野纺织.
fig, ax = plt.subplots()
ax.plot(df.index, df['新野纺织'], linewidth=2)
ax.set_xlabel('年度')
ax.set_ylabel('营业收入')
ax.set_title('数据中纺织企业营收折线图')
ax.grid(True, axis='both')
In [11]:
# Revenue line chart for 浔兴股份.
fig, ax = plt.subplots()
ax.plot(df.index, df['浔兴股份'], linewidth=2)
ax.set_xlabel('年度')
ax.set_ylabel('营业收入')
ax.set_title('数据中纺织企业营收折线图')
ax.grid(True, axis='both')