罗妙璇的综合实验

Ⅰ.爬取所分配上市公司年报

导入本部分所需模块


from time import sleep
from selenium.webdriver.common.by import By
from selenium import webdriver

代码

  

 options = webdriver.ChromeOptions()
out_path = r'E:\py_projects\data'  
prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': out_path}
options.add_experimental_option('prefs', prefs)


def spider(stock):
    """Drive Chrome through the SZSE disclosure page and click the files listed for one stock.

    The function opens the notice index page, types ``stock`` into the code
    search box, picks the first suggestion, selects the first entry of the
    announcement-type menu (presumably "annual report", judging by the
    completion message), runs the query and then clicks the third-column
    link of result rows 1 through 21. Files are saved wherever the
    module-level ``options`` tell Chrome to download them.

    Args:
        stock: Stock code to search for, e.g. ``'002132'``.

    Raises:
        selenium.common.exceptions.WebDriverException: If any element cannot
            be located or clicked. The browser is closed before the
            exception propagates.
    """
    driver = webdriver.Chrome(options=options)
    try:
        driver.maximize_window()
        # The implicit wait applies to every later find_element call in this
        # session, so it only needs to be set once.
        driver.implicitly_wait(10)
        driver.get("https://www.szse.cn/disclosure/listed/notice/index.html")
        sleep(5)

        code = driver.find_element(By.XPATH, '//*[@id="input_code"]')
        code.clear()
        code.send_keys(stock)
        driver.find_element(By.XPATH, '//*[@id="c-typeahead-menu-1"]').click()
        sleep(1)

        # Open the announcement-type drop-down and choose its first entry.
        driver.find_element(By.XPATH, '//*[@id="select_gonggao"]/div/div/a/span[1]').click()
        sleep(1)
        driver.find_element(By.XPATH, '//*[@id="c-selectex-menus-3"]/li[1]/a').click()
        sleep(1)
        driver.find_element(By.XPATH, '//*[@id="query-btn"]').click()
        sleep(1)

        # Table rows are 1-based in XPath; rows 1..21 of the first result page
        # are clicked one after another.
        for row in range(1, 22):
            driver.find_element(
                By.XPATH,
                f'//*[@id="disclosure-table"]/div/div[1]/div/table/tbody/tr[{row}]/td[3]/div/a/span[3]',
            ).click()
            sleep(5)  # give the download time to start before the next click
            driver.execute_script('window.scrollTo(0,200)')

        print(stock + '/公司年报下载完成!')
        sleep(1)
    finally:
        # Always release the browser, even when a lookup or click fails.
        driver.quit()


# Other stock codes assigned to this experiment (enable them as needed):
# '000039', '000055'
# '000778', '000890', '000969', '002026', '002032', '002084', '002132', '002135'
stock_codes = ['002132']
for stock_code in stock_codes:
    spider(stock_code)

  
  

结果

获取结果截图

Ⅱ.通过年报获取公司营业信息

对年报进行数据提取

    
import re
import pandas as pd
import fitz
    
    

数据获取

    
import PyPDF2
import re

import pandas as pd


def search_keyword(data, keyword):
    """Return the positions of the entries in ``data`` that match ``keyword``.

    ``keyword`` is compiled as a case-insensitive regular expression and each
    entry is converted with ``str()`` before it is searched.

    Args:
        data: Iterable of entries to scan.
        keyword: Regular-expression pattern to look for.

    Returns:
        A list with the zero-based index of every matching entry, in order.
    """
    matcher = re.compile(keyword, re.IGNORECASE)
    return [position for position, entry in enumerate(data) if matcher.search(str(entry))]


# Module-level accumulators: read_pdf2 appends the earnings-per-share values
# to li_meigu and the operating-revenue values to li_yingye.
li_meigu, li_yingye = [], []


def read_pdf1(file_path1):
    """Print the basic EPS and operating-revenue figures found on page index 6 of a PDF.

    For EPS, the line after the first line matching '基本每股收益' is used;
    for revenue, the first line matching '营业收入' itself. Each line is split
    on single spaces and tokens 1, 2 and 4 of the non-empty tokens are
    printed. Nothing is returned or stored.

    Args:
        file_path1: Path of the PDF file to open.
    """
    with open(file_path1, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        def row_tokens(page_no, keyword, offset):
            # Split the page text into lines, find the first keyword hit and
            # return the raw space-split tokens of the line `offset` rows
            # below it, together with the non-empty tokens.
            text_lines = pdf_reader.pages[page_no].extract_text().split('\n')
            hits = search_keyword(text_lines, keyword)
            raw = text_lines[hits[0] + offset].split(" ")
            return raw, [token for token in raw if token]

        # Basic earnings per share: the figures sit on the line after the label.
        _, eps_cells = row_tokens(6, '基本每股收益', 1)
        for label, column in (('2022年每股收益', 1), ('2021年每股收益', 2), ('2020年每股收益', 4)):
            print(label, eps_cells[column])

        # Operating revenue: label and figures share one line.
        revenue_raw, revenue_cells = row_tokens(6, '营业收入', 0)
        print(revenue_raw)
        for label, column in (('2022年营业收入', 1), ('2021年营业收入', 2), ('2020年营业收入', 4)):
            print(label, revenue_cells[column])


# file_path1 = "data/新兴铸管:2022年年度报告.PDF"  # 替换为实际的PDF文件路径
# read_pdf1(file_path1)


def read_pdf2(file_path2):
    """Print and collect EPS (page index 11) and revenue (page index 10) figures from a PDF.

    For EPS, the line after the first line matching
    '归属于母公司股东的基本每股收益' is used; for revenue, the first line matching
    '营业收入' itself. Each line is split on single spaces and tokens 1, 2 and
    4 of the non-empty tokens are printed, then appended to the module-level
    lists ``li_meigu`` / ``li_yingye``.

    Args:
        file_path2: Path of the PDF file to open.

    Returns:
        The tuple ``(li_meigu, li_yingye)`` — the shared module-level lists,
        which keep growing across calls.
    """
    picked_columns = (1, 2, 4)

    with open(file_path2, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        def row_tokens(page_no, keyword, offset):
            # Split the page text into lines, find the first keyword hit and
            # return the raw space-split tokens of the line `offset` rows
            # below it, together with the non-empty tokens.
            text_lines = pdf_reader.pages[page_no].extract_text().split('\n')
            hits = search_keyword(text_lines, keyword)
            raw = text_lines[hits[0] + offset].split(" ")
            return raw, [token for token in raw if token]

        # Basic earnings per share: the figures sit on the line after the label.
        _, eps_cells = row_tokens(11, '归属于母公司股东的基本每股收益', 1)
        for label, column in zip(('2017年每股收益', '2016年每股收益', '2015年每股收益'), picked_columns):
            print(label, eps_cells[column])
        li_meigu.extend(eps_cells[column] for column in picked_columns)

        # Operating revenue: label and figures share one line.
        revenue_raw, revenue_cells = row_tokens(10, '营业收入', 0)
        print(revenue_raw)
        for label, column in zip(('2017年营业收入', '2016年营业收入', '2015年营业收入'), picked_columns):
            print(label, revenue_cells[column])
        li_yingye.extend(revenue_cells[column] for column in picked_columns)

    return li_meigu, li_yingye


# file_path2 = "data/新兴铸管:2019年年度报告.PDF"  # 替换为实际的PDF文件路径


# read_pdf2(file_path2)
    
    

10所上市公司所需数据汇总

    
def file():
    """Write the revenue / EPS table for stock 000969 to an Excel workbook.

    The ten revenue and EPS values are hard-coded as text. The plotting code
    in this file pairs them with the years 2022 down to 2013, in that order.
    Company name, code, address and website are repeated on every row. The
    workbook is saved under the '表格' directory, named after the company's
    short name.
    """
    import os  # local import keeps this block self-contained

    address, code, short_name, website = (
        '北京市海淀区学院南路 76 号', '000969', '安泰科技',
        'http://www.atmcn.com/',
    )

    revenue = ['7,406,006,283.25', '6,327,072,471.05', '5,112,814,901.95', '4,780,216,900.36',
               '5,054,086,059.15', '4,659,656,425.17', '3,921,200,536.67', '3,758,663,488.36',
               '4,155,698,392.31', '3,848,446,118.08']
    eps = ['0.2057', '0.1687', '0.1011', '0.1606', '-0.2121',
           '0.0577', '0.0788', '0.1015', '-0.2201',
           '0.073']

    columns = ['股票简称', '股票代码', '公司地址', '公司网址', '营业收入', '每股收益']
    frame = pd.DataFrame(
        {
            '股票简称': short_name,
            '股票代码': code,
            '公司地址': address,
            '公司网址': website,
            '营业收入': revenue,
            '每股收益': eps,
        },
        columns=columns,
    )

    # to_excel does not create missing directories.
    os.makedirs('表格', exist_ok=True)
    # The `encoding` keyword was never used by to_excel and is rejected by
    # pandas 2.x, so it is no longer passed.
    frame.to_excel('表格/' + short_name + '.xlsx', index=False)


# Run the export defined just above; the name `file` is redefined for the next company afterwards.
file()

def file():
    """Write the revenue / EPS table for stock 002135 to an Excel workbook.

    The ten revenue and EPS values are hard-coded as text. The plotting code
    in this file pairs them with the years 2022 down to 2013, in that order.
    Company name, code, address and website are repeated on every row. The
    workbook is saved under the '表格' directory, named after the company's
    short name.
    """
    import os  # local import keeps this block self-contained

    address, code, short_name, website = (
        '浙江省杭州市萧山区衙前镇衙前路 593 号', '002135', '东南网架',
        'http://www.dongnanwangjia.com/',
    )

    revenue = ['12,064,434,647.04', '11,287,107,272.03', '9,256,289,931.66', '8,976,374,629.37',
               '8,694,640,505.76', '7,791,528,910.40', '5,738,468,159.88', '5,196,043,096.55',
               '4,226,868,489.63', '3,720,788,318.32']
    eps = ['0.25', '0.48', '0.26', '0.26', '0.17',
           '0.12', '0.06', '0.05', '0.09',
           '0.08']

    columns = ['股票简称', '股票代码', '公司地址', '公司网址', '营业收入', '每股收益']
    frame = pd.DataFrame(
        {
            '股票简称': short_name,
            '股票代码': code,
            '公司地址': address,
            '公司网址': website,
            '营业收入': revenue,
            '每股收益': eps,
        },
        columns=columns,
    )

    # to_excel does not create missing directories.
    os.makedirs('表格', exist_ok=True)
    # The `encoding` keyword was never used by to_excel and is rejected by
    # pandas 2.x, so it is no longer passed.
    frame.to_excel('表格/' + short_name + '.xlsx', index=False)


# Run the export defined just above; the name `file` is redefined for the next company afterwards.
file()

def file():
    """Write the revenue / EPS table for stock 000890 to an Excel workbook.

    The ten revenue and EPS values are hard-coded as text. The plotting code
    in this file pairs them with the years 2022 down to 2013, in that order.
    Company name, code, address and website are repeated on every row. The
    workbook is saved under the '表格' directory, named after the company's
    short name.
    """
    import os  # local import keeps this block self-contained

    address, code, short_name, website = (
        '江苏省江阴市澄江中路 165 号', '000890', '法尔胜',
        'https://www.chinafasten.cn/',
    )

    revenue = ['677,575,109.69', '667,468,092.57', '467,058,989.47', '1,003,609,325.63',
               '1,698,089,206.54', '2,004,295,820.98', '1,906,735,239.66', '1,426,542,116.44',
               '1,552,094,711.88', '1,566,444,923.84']
    # NOTE(review): the first EPS entry carries a trailing space; float()
    # tolerates it, but it looks like a copy/paste artifact.
    eps = ['-0.03 ', '0.10', '0.04', '-2.05', '-0.38',
           '0.38', '0.37', '0.0146', '0.0137',
           '0.0169']

    columns = ['股票简称', '股票代码', '公司地址', '公司网址', '营业收入', '每股收益']
    frame = pd.DataFrame(
        {
            '股票简称': short_name,
            '股票代码': code,
            '公司地址': address,
            '公司网址': website,
            '营业收入': revenue,
            '每股收益': eps,
        },
        columns=columns,
    )

    # to_excel does not create missing directories.
    os.makedirs('表格', exist_ok=True)
    # The `encoding` keyword was never used by to_excel and is rejected by
    # pandas 2.x, so it is no longer passed.
    frame.to_excel('表格/' + short_name + '.xlsx', index=False)


# Run the export defined just above; the name `file` is redefined for the next company afterwards.
file()

def file():
    """Write the revenue / EPS table for stock 000055 to an Excel workbook.

    The ten revenue and EPS values are hard-coded as text. The plotting code
    in this file pairs them with the years 2022 down to 2013, in that order.
    Company name, code, address and website are repeated on every row. The
    workbook is saved under the '表格' directory, named after the company's
    short name.
    """
    import os  # local import keeps this block self-contained

    address, code, short_name, website = (
        '深圳市南山区龙珠四路 2 号方大城 T1 栋 39 层', '000055', '方大集团',
        'http://www.fangda.com/',
    )

    revenue = ['3,846,975,948.44', '3,557,724,397.54', '3,000,191,773.63', '3,005,749,558.66',
               '3,048,680,152.06', '2,947,470,813.58', '4,203,866,173.72', '2,550,467,494.78',
               '1,938,324,435.51', '1,747,620,845.74']
    eps = ['0.26', '0.21', '0.35', '0.310', '1.91',
           '0.97', '0.91', '0.14', '0.13',
           '0.11']

    columns = ['股票简称', '股票代码', '公司地址', '公司网址', '营业收入', '每股收益']
    frame = pd.DataFrame(
        {
            '股票简称': short_name,
            '股票代码': code,
            '公司地址': address,
            '公司网址': website,
            '营业收入': revenue,
            '每股收益': eps,
        },
        columns=columns,
    )

    # to_excel does not create missing directories.
    os.makedirs('表格', exist_ok=True)
    # The `encoding` keyword was never used by to_excel and is rejected by
    # pandas 2.x, so it is no longer passed.
    frame.to_excel('表格/' + short_name + '.xlsx', index=False)


# Run the export defined just above; the name `file` is redefined for the next company afterwards.
file()

def file():
    """Write the revenue / EPS table for stock 002084 to an Excel workbook.

    The ten revenue and EPS values are hard-coded as text. The plotting code
    in this file pairs them with the years 2022 down to 2013, in that order.
    Company name, code, address and website are repeated on every row. The
    workbook is saved under the '表格' directory, named after the company's
    short name.
    """
    import os  # local import keeps this block self-contained

    address, code, short_name, website = (
        '广州市番禺区沙头街禺山西路 363 号联邦工业城内', '002084', '海鸥住工',
        'http://www.seagullgroup.cn/',
    )

    revenue = ['3,294,840,644.48', '4,125,639,722.35', '3,340,050,002.51', '2,569,424,298.56',
               '2,224,695,145.25', '2,070,648,154.20', '1,786,562,163.94', '1,714,909,434.79',
               '1,649,972,494.54', '1,675,669,916.22']
    eps = ['0.0734', '0.1319', '0.2830', '0.2634', '0.0828',
           '0.1816', '0.1862', '0.1102', '0.1013',
           '0.1']

    columns = ['股票简称', '股票代码', '公司地址', '公司网址', '营业收入', '每股收益']
    frame = pd.DataFrame(
        {
            '股票简称': short_name,
            '股票代码': code,
            '公司地址': address,
            '公司网址': website,
            '营业收入': revenue,
            '每股收益': eps,
        },
        columns=columns,
    )

    # to_excel does not create missing directories.
    os.makedirs('表格', exist_ok=True)
    # The `encoding` keyword was never used by to_excel and is rejected by
    # pandas 2.x, so it is no longer passed.
    frame.to_excel('表格/' + short_name + '.xlsx', index=False)


# Run the export defined just above; the name `file` is redefined for the next company afterwards.
file()

def file():
    """Write the revenue / EPS table for stock 002132 to an Excel workbook.

    The ten revenue and EPS values are hard-coded as text. The plotting code
    in this file pairs them with the years 2022 down to 2013, in that order.
    Company name, code, address and website are repeated on every row. The
    workbook is saved under the '表格' directory, named after the company's
    short name.
    """
    import os  # local import keeps this block self-contained

    address, code, short_name, website = (
        '河南省巩义市恒星工业园区', '002132', '恒星科技',
        'https://www.hengxingchinese.com/',
    )

    # NOTE(review): the first revenue entry and the fifth EPS entry carry a
    # trailing space; float() tolerates it, but it looks like a copy/paste
    # artifact.
    revenue = ['4,417,416,588.93 ', '3,396,281,106.63', '2,832,759,589.33', '3,386,147,599.09',
               '3,014,331,960.05', '3,046,175,236.90', '2,064,455,574.55', '1,735,798,925.07',
               '1,876,574,822.39', '1,749,087,962.71']
    eps = ['0.14', '0.11', '0.10', '0.07', '-0.11 ',
           '0.05', '0.09', '0.04', '0.08',
           '0.06']

    columns = ['股票简称', '股票代码', '公司地址', '公司网址', '营业收入', '每股收益']
    frame = pd.DataFrame(
        {
            '股票简称': short_name,
            '股票代码': code,
            '公司地址': address,
            '公司网址': website,
            '营业收入': revenue,
            '每股收益': eps,
        },
        columns=columns,
    )

    # to_excel does not create missing directories.
    os.makedirs('表格', exist_ok=True)
    # The `encoding` keyword was never used by to_excel and is rejected by
    # pandas 2.x, so it is no longer passed.
    frame.to_excel('表格/' + short_name + '.xlsx', index=False)


# Run the export defined just above; the name `file` is redefined for the next company afterwards.
file()

def file():
    """Write the revenue / EPS table for stock 002026 to an Excel workbook.

    The ten revenue and EPS values are hard-coded as text. The plotting code
    in this file pairs them with the years 2022 down to 2013, in that order.
    Company name, code, address and website are repeated on every row. The
    workbook is saved under the '表格' directory, named after the company's
    short name.
    """
    import os  # local import keeps this block self-contained

    address, code, short_name, website = (
        '山东省威海临港经济技术开发区苘山镇中韩路 2 号', '002026', '山东威达',
        'http://www.weidapeacock.com/',
    )

    # NOTE(review): the 8th and 9th revenue entries are identical
    # ('837,445,261.00') — confirm against the annual reports.
    revenue = ['2,467,176,580.78', '3,309,710,388.26', '2,165,052,831.80', '1,575,109,439.78',
               '1,661,996,645.79', '1,469,380,404.82', '1,180,581,447.47', '837,445,261.00',
               '837,445,261.00', '718,220,284.14']
    eps = ['0.47', '0.89', '0.60', '-0.28', '0.37',
           '0.30', '0.26', '0.25', '0.34',
           '0.2011']

    columns = ['股票简称', '股票代码', '公司地址', '公司网址', '营业收入', '每股收益']
    frame = pd.DataFrame(
        {
            '股票简称': short_name,
            '股票代码': code,
            '公司地址': address,
            '公司网址': website,
            '营业收入': revenue,
            '每股收益': eps,
        },
        columns=columns,
    )

    # to_excel does not create missing directories.
    os.makedirs('表格', exist_ok=True)
    # The `encoding` keyword was never used by to_excel and is rejected by
    # pandas 2.x, so it is no longer passed.
    frame.to_excel('表格/' + short_name + '.xlsx', index=False)


# Run the export defined just above; the name `file` is redefined for the next company afterwards.
file()

def file():
    """Write the revenue / EPS table for stock 002032 to an Excel workbook.

    The ten revenue and EPS values are hard-coded as text. The plotting code
    in this file pairs them with the years 2022 down to 2013, in that order.
    Company name, code, address and website are repeated on every row. The
    workbook is saved under the '表格' directory, named after the company's
    short name.
    """
    import os  # local import keeps this block self-contained

    address, code, short_name, website = (
        '中国杭州高新技术产业区江晖路 1772 号苏泊尔大厦 15 层', '002032', '苏泊尔',
        'https://www.supor.com.cn/',
    )

    revenue = ['20,170,527,516.66', '21,585,331,407.47', '18,596,944,289.02', '19,853,477,882.97',
               '17,851,264,801.72', '14,542,193,769.70', '11,947,123,201.12', '10,909,686,625.90',
               '9,534,643,945.84', '8,383,249,626.61']
    eps = ['2.565', '2.400', '2.264', '2.347', '2.041',
           '1.623', '1.712', '1.413', '1.096',
           '0.927']

    columns = ['股票简称', '股票代码', '公司地址', '公司网址', '营业收入', '每股收益']
    frame = pd.DataFrame(
        {
            '股票简称': short_name,
            '股票代码': code,
            '公司地址': address,
            '公司网址': website,
            '营业收入': revenue,
            '每股收益': eps,
        },
        columns=columns,
    )

    # to_excel does not create missing directories.
    os.makedirs('表格', exist_ok=True)
    # The `encoding` keyword was never used by to_excel and is rejected by
    # pandas 2.x, so it is no longer passed.
    frame.to_excel('表格/' + short_name + '.xlsx', index=False)


# Run the export defined just above; the name `file` is redefined for the next company afterwards.
file()

def file():
    """Write the revenue / EPS table for stock 000778 to an Excel workbook.

    The ten revenue and EPS values are hard-coded as text. The plotting code
    in this file pairs them with the years 2022 down to 2013, in that order.
    Company name, code, address and website are repeated on every row. The
    workbook is saved under the '表格' directory, named after the company's
    short name.
    """
    import os  # local import keeps this block self-contained

    address, code, short_name, website = (
        '河北省武安市上洛阳村北(2672 厂区)', '000778', '新兴铸管',
        'https://www.xinxing-pipes.com/',
    )

    revenue = ['47,760,058,256.98', '53,301,106,059.49', '42,960,921,062.41', '40,889,707,108.27',
               '40,547,120,305.78', '41,266,372,331.97', '52,159,883,504.68', '50,030,639,751.47',
               '60,793,273,381.55', '63,014,443,991.26']
    eps = ['0.4200', '0.5028', '0.4541', '0.3751', '0.5265',
           '0.2779', '0.1208', '0.1646', '0.2292', '0.5263']

    columns = ['股票简称', '股票代码', '公司地址', '公司网址', '营业收入', '每股收益']
    frame = pd.DataFrame(
        {
            '股票简称': short_name,
            '股票代码': code,
            '公司地址': address,
            '公司网址': website,
            '营业收入': revenue,
            '每股收益': eps,
        },
        columns=columns,
    )

    # to_excel does not create missing directories.
    os.makedirs('表格', exist_ok=True)
    # The `encoding` keyword was never used by to_excel and is rejected by
    # pandas 2.x, so it is no longer passed.
    frame.to_excel('表格/' + short_name + '.xlsx', index=False)


# Run the export defined just above.
file()
    
    

结果

结果截图 结果截图 结果截图 结果截图

根据爬虫数据绘图分析

    
import pandas as pd  # 导入数据统计模块
import matplotlib  # 导入图表模块
import matplotlib.pyplot as plt  # 导入绘图模块
import numpy as np

# Avoid garbled Chinese text in figures: render with the SimHei font and
# keep the minus sign displaying correctly on the axes.
matplotlib.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Report years, newest first: 2022 down to 2013.
date = sorted(range(2013, 2023), reverse=True)

# comInfoList = []
# Company short names; the plotting functions read '表格/' + short name + '.xlsx' for each.
name = [
    '中集集团', '新兴铸管', '苏泊尔', '山东威达', '恒星科技',
    '海鸥住工', '方大集团', '法尔胜', '东南网架', '安泰科技',
]


def zhexian_yingye():
    """Show a line chart of operating revenue (in millions of yuan) per company.

    For every entry of the module-level ``name`` list the matching workbook
    under '表格/' is read, its '营业收入' column is converted from
    comma-separated text to millions, and the result is plotted against the
    module-level ``date`` list.
    """
    plt.figure()

    for company in name:
        sheet = pd.read_excel('表格/' + str(company) + '.xlsx')  # Excel workbook, not CSV
        # Revenue cells look like '7,406,006,283.25'. str() keeps the
        # conversion working if a cell comes back as a number. Only the
        # first len(date) rows are used so x and y stay the same length.
        revenue = [
            round(float(str(cell).replace(',', '')) / 1000000, 2)
            for cell in sheet['营业收入'].iloc[:len(date)]
        ]
        plt.plot(date, revenue, label=company)

    # Legend, title and axis labels.
    plt.legend()
    # The x-axis spans 2013-2022, so the title states that range.
    plt.title('2013-2022年营业收入折线图')
    plt.xlabel('年份')
    plt.ylabel('营业收入(百万元)')
    plt.show()


def zhexian_meigu():
    """Show a line chart of earnings per share (in yuan) per company.

    For every entry of the module-level ``name`` list the matching workbook
    under '表格/' is read and its '每股收益' column is plotted against the
    module-level ``date`` list.
    """
    plt.figure()

    for company in name:
        sheet = pd.read_excel('表格/' + str(company) + '.xlsx')  # Excel workbook, not CSV
        # The workbooks written by this file store EPS as text (e.g.
        # '0.2057', '-0.03 '). Plotting raw strings makes matplotlib treat
        # them as categories, so convert to float first.
        eps = [float(str(cell).replace(',', '')) for cell in sheet['每股收益']]
        plt.plot(date, eps, label=company)

    # Legend, title and axis labels.
    plt.legend()
    # The x-axis spans 2013-2022, so the title states that range.
    plt.title('2013-2022年每股收益折线图')
    plt.xlabel('年份')
    plt.ylabel('元')
    plt.show()


def bar_meigu():
    """Show a grouped bar chart of earnings per share (in yuan), one bar per company and year.

    For every entry of the module-level ``name`` list the matching workbook
    under '表格/' is read. The years on the x-axis come from the module-level
    ``date`` list.
    """
    categories = date

    values = []
    for company in name:
        sheet = pd.read_excel('表格/' + str(company) + '.xlsx')  # Excel workbook, not CSV
        # The workbooks written by this file store EPS as text, so convert
        # to float. Bar heights must be numeric.
        values.append([float(str(cell).replace(',', '')) for cell in sheet['每股收益']])

    # Width of a single bar within a year group.
    bar_width = 0.1

    plt.figure()

    # Left-most bar position of every year group.
    x = np.arange(len(categories))

    # One bar series per company, each shifted right by one bar width.
    for offset, (company, eps) in enumerate(zip(name, values)):
        plt.bar(x + offset * bar_width, eps, width=bar_width, label=company)

    # Centre the year tick under its group of bars.
    plt.xlabel('年份')
    plt.xticks(x + (len(values) / 2 - 0.5) * bar_width, categories)

    plt.ylabel('元')
    plt.legend()
    plt.title('每股收益柱状图')
    plt.show()


def bar_yingye():
    """Show a grouped bar chart of operating revenue (in millions), one bar per company and year.

    For every entry of the module-level ``name`` list the matching workbook
    under '表格/' is read and the first ten '营业收入' cells are converted from
    comma-separated text to millions. The years on the x-axis come from the
    module-level ``date`` list.
    """
    categories = date

    values = []
    for company in name:
        sheet = pd.read_excel('表格/' + str(company) + '.xlsx')  # Excel workbook, not CSV
        column = sheet['营业收入']
        values.append(
            [round(float(column[row].replace(',', '')) / 1000000, 2) for row in range(10)]
        )

    # Width of a single bar within a year group.
    bar_width = 0.1

    plt.figure()

    # Left-most bar position of every year group.
    x = np.arange(len(categories))

    # One bar series per company, each shifted right by one bar width.
    for offset, series in enumerate(values):
        plt.bar(x + offset * bar_width, series, width=bar_width, label=name[offset])

    # Centre the year tick under its group of bars.
    plt.xlabel('年份')
    plt.xticks(x + (len(values) / 2 - 0.5) * bar_width, categories)

    plt.ylabel('百万')
    plt.legend()
    plt.title('营业收益柱状图')
    plt.show()


# Render the two line charts; the bar-chart calls below are left disabled.
# bar_yingye()
# bar_meigu()
for draw_chart in (zhexian_meigu, zhexian_yingye):
    draw_chart()
    
    

结果

结果截图 结果截图 结果截图 结果截图

结果解读

从营业收益柱状图来看,新兴铸管以绝对的优势领先其余九家企业,其营业收入是排在第二的苏泊尔的几倍乃至10倍,势头较为明显的还有苏泊尔和东南网架,其余几家差异较小。从营业收益折线图来看,新兴铸管虽然遥遥领先,但10年来呈现向下的趋势;纵观其余9家企业,虽然营业收入势头不明显,但在10年间大都呈现向上的趋势。 从每股收益的柱状图分析,与营业收入呈相反趋势,苏泊尔跃升首位,与其他公司差异显著,并且可以明显看出新兴铸管的每股收益相对其他九家企业较低。方大集团在2017年以前的每股收益不断爬升,并在2018年达到峰值,但在2018年后每股收益不断下降。从每股收益的折线图来看,苏泊尔的每股收益在10年来不断升高。中集集团和法尔胜10年来波动性较大,分别于2018年和2019年呈现较为剧烈的波动。 综合以上分析,新兴铸管的营业收入数倍于其他企业,但每股收益走势平平,可以推测其留存收益较高,注重公司的长远发展。特征较为明显的苏泊尔,无论是营业收入还是每股收益,表现都优于大多数企业,在发展公司的同时提高了股东收益。除了方大集团与法尔胜波动性较大之外,其余企业10年来数值上的表现呈现相同的走势。

Ⅴ.总结

此次大作业可以说是大学以来难度最高的一次作业,但是老师上课非常细心,并且会认真解答每一个有疑惑的同学。上课虽然是三节课,但是每次都感觉时光飞逝,同时每个学到的知识点都是很有用的干货。这是入学以来第一次如此深入地学习Python知识并实际操作。我了解了如何从深交所/上交所获取需要的信息,如何用函数爬取需要的数据,可以说是开启了爬虫领域的大门。金融、计算机、数学,三者密不可分,这门课的学习让我通过实际操作,接触到书本上学到过的公式和会计分录。按照书本学习和实际操作处理有很大不同,贴近实际的操作更能感受到数据的走势与变化。很幸运选了这门课,也很感谢吴老师的耐心教学!

THE END