Files
MADataManagment/exportExcelToDB_SH.py

433 lines
16 KiB
Python
Raw Normal View History

2025-08-15 23:52:12 +08:00
"""
2025-08-18 14:05:59 +08:00
存储上海证券交易股票列表数据
2025-08-15 23:52:12 +08:00
2025-08-18 14:05:59 +08:00
不确定其数据爬取规则防止 IP 被封
暂时使用该方案获取股票列表数据
下载excel,收到导入到数据库
2025-08-15 23:52:12 +08:00
"""
import pandas as pd
import os
import sys
import csv
import chardet # 用于检测文件编码
2025-08-18 14:05:59 +08:00
from pathlib import Path
from datetime import datetime
2025-08-15 23:52:12 +08:00
2025-08-18 14:05:59 +08:00
from MySQLHelper import MySQLHelper
from LogHelper import LogHelper
2025-08-15 23:52:12 +08:00
2025-08-20 17:30:14 +08:00
logger = LogHelper(logger_name = 'SH_Import').setup()
2025-08-15 23:52:12 +08:00
class StockDataImporter:
"""股票数据导入工具支持CSV"""
COLUMN_MAPPING = {
'A股代码': 'a_stock_code',
'B股代码': 'b_stock_code',
'证券简称': 'short_name',
'扩位证券简称': 'extended_name',
'公司英文全称': 'eng_name',
'上市日期': 'listing_date'
}
def __init__(self, data_dir: Path, db_config: dict):
self.data_dir = data_dir
self.db_config = db_config
self.df = None
self.csv_file = None
self.encoding = 'utf-8' # 默认编码
self.delimiter = ',' # 默认分隔符
def find_csv_file(self) -> Path:
"""在data文件夹中查找CSV文件"""
# 查找所有CSV文件
csv_files = list(self.data_dir.glob("GPLIST.csv"))
if not csv_files:
logger.error(f"{self.data_dir} 中没有找到CSV文件")
return None
# 如果有多个文件,选择最新的
if len(csv_files) > 1:
csv_files.sort(key=os.path.getmtime, reverse=True)
logger.info(f"找到多个CSV文件选择最新的: {csv_files[0].name}")
return csv_files[0]
def validate_file(self, file_path: Path) -> bool:
"""验证CSV文件是否有效"""
try:
if not file_path.exists():
logger.error(f"CSV文件不存在: {file_path}")
return False
file_size = file_path.stat().st_size
if file_size == 0:
logger.error(f"CSV文件为空: {file_path}")
return False
return True
except Exception as e:
logger.error(f"文件验证失败: {e}")
return False
def detect_file_encoding(self, file_path: Path) -> str:
"""检测文件编码"""
try:
# 读取文件开头部分进行编码检测
with open(file_path, 'rb') as f:
raw_data = f.read(10000) # 读取前10KB
# 使用chardet检测编码
result = chardet.detect(raw_data)
encoding = result['encoding']
confidence = result['confidence']
# 常见编码替代
encoding_map = {
'GB2312': 'GBK',
'gb2312': 'GBK',
'ISO-8859-1': 'latin1',
'ascii': 'utf-8'
}
# 应用映射
encoding = encoding_map.get(encoding, encoding)
logger.info(f"检测到编码: {encoding} (置信度: {confidence:.2f})")
return encoding or 'utf-8'
except Exception as e:
logger.error(f"编码检测失败: {e}, 使用默认UTF-8")
return 'utf-8'
def detect_csv_delimiter(self, file_path: Path) -> str:
"""自动检测CSV分隔符"""
try:
# 使用检测到的编码打开文件
with open(file_path, 'r', encoding=self.encoding) as f:
# 读取前5行
lines = [f.readline() for _ in range(5) if f.readline()]
# 尝试常见分隔符
delimiters = [',', '\t', ';', '|']
delimiter_counts = {}
for delim in delimiters:
count = 0
for line in lines:
count += line.count(delim)
delimiter_counts[delim] = count
# 选择出现次数最多的分隔符
best_delim = max(delimiter_counts, key=delimiter_counts.get)
# 如果没有任何分隔符,则使用逗号
if delimiter_counts[best_delim] == 0:
logger.warning(f"无法检测到有效的分隔符,使用默认逗号分隔符")
return ','
logger.info(f"检测到分隔符: {repr(best_delim)}")
return best_delim
except Exception as e:
logger.error(f"检测分隔符失败: {e}, 使用默认逗号分隔符")
return ','
def read_csv_data(self, file_path: Path) -> bool:
"""从CSV文件读取数据"""
try:
# 1. 检测文件编码
self.encoding = self.detect_file_encoding(file_path)
# 2. 检测分隔符
self.delimiter = self.detect_csv_delimiter(file_path)
# 3. 读取CSV文件
logger.info(f"使用编码 '{self.encoding}' 和分隔符 '{self.delimiter}' 读取文件")
self.df = pd.read_csv(
file_path,
delimiter=self.delimiter,
dtype=str,
encoding=self.encoding,
on_bad_lines='warn',
quoting=csv.QUOTE_MINIMAL,
engine='python' # 更健壮的引擎
)
# 检查是否读取到数据
if self.df.empty:
logger.error("CSV文件没有包含有效数据")
return False
# 重命名列
self.df = self.df.rename(columns=self.COLUMN_MAPPING)
# 移除可能存在的空行
self.df = self.df.dropna(how='all')
logger.info(f"成功读取CSV数据{len(self.df)} 条记录")
return True
except UnicodeDecodeError:
# 尝试其他编码
encodings_to_try = ['GBK', 'latin1', 'ISO-8859-1', 'utf-16']
for enc in encodings_to_try:
try:
logger.warning(f"尝试使用 {enc} 编码读取文件")
self.df = pd.read_csv(
file_path,
delimiter=self.delimiter,
dtype=str,
encoding=enc
)
self.encoding = enc
logger.info(f"成功使用 {enc} 编码读取文件")
return True
except:
continue
logger.error("所有编码尝试均失败")
return False
except PermissionError:
logger.error(f"文件被占用,请关闭后重试: {file_path}")
return False
except Exception as e:
logger.error(f"读取CSV文件失败: {e}")
return False
def clean_stock_data(self) -> bool:
"""清洗股票数据"""
try:
# 处理B股代码将'-'转换为None
self.df['b_stock_code'] = self.df['b_stock_code'].replace('-', None)
# 格式化上市日期
self.df['listing_date'] = pd.to_datetime(
self.df['listing_date'],
format='%Y%m%d',
errors='coerce'
).dt.strftime('%Y-%m-%d')
# 检查日期转换是否成功
date_na_count = self.df['listing_date'].isna().sum()
if date_na_count > 0:
logger.warning(f"发现 {date_na_count} 条记录的上市日期格式不正确")
# 提取交易所信息
self.df['exchange'] = self.df['a_stock_code'].apply(
lambda x: 'SH' if str(x).startswith('60') else 'SZ' if str(x).startswith(('00', '30')) else 'OTHER'
)
# 验证A股代码格式
invalid_codes = self.df[~self.df['a_stock_code'].astype(str).str.match(r'^\d{6}$')]
if not invalid_codes.empty:
logger.warning(f"发现 {len(invalid_codes)} 条无效的A股代码")
logger.debug(f"无效代码示例: {invalid_codes['a_stock_code'].head().tolist()}")
logger.info("数据清洗完成")
return True
except Exception as e:
logger.error(f"数据清洗失败: {e}")
return False
def create_stocks_table(self, db: MySQLHelper) -> bool:
"""创建股票信息表"""
create_table_sql = """
CREATE TABLE IF NOT EXISTS stocks_sh (
a_stock_code VARCHAR(6) PRIMARY KEY COMMENT 'A股代码',
b_stock_code VARCHAR(6) COMMENT 'B股代码',
short_name VARCHAR(50) NOT NULL COMMENT '证券简称',
extended_name VARCHAR(100) COMMENT '扩位证券简称',
eng_name VARCHAR(150) COMMENT '公司英文全称',
listing_date DATE NOT NULL COMMENT '上市日期',
exchange VARCHAR(2) NOT NULL COMMENT '交易所',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间'
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='沪深股票信息表';
"""
try:
db.execute_update(create_table_sql)
logger.info("股票信息表创建成功")
return True
except Exception as e:
logger.error(f"创建表失败: {e}")
return False
def insert_data_to_db(self, db: MySQLHelper) -> bool:
"""将数据插入数据库"""
if self.df is None or self.df.empty:
logger.error("没有有效数据可插入")
return False
# 准备SQL语句支持重复记录更新
insert_sql = """
INSERT INTO stocks_sh (
a_stock_code, b_stock_code, short_name,
extended_name, eng_name, listing_date, exchange
) VALUES (
%s, %s, %s, %s, %s, %s, %s
)
ON DUPLICATE KEY UPDATE
b_stock_code = VALUES(b_stock_code),
short_name = VALUES(short_name),
extended_name = VALUES(extended_name),
eng_name = VALUES(eng_name),
listing_date = VALUES(listing_date),
exchange = VALUES(exchange)
"""
# 准备参数列表
params_list = []
for _, row in self.df.iterrows():
# 处理可能的NaN值
listing_date = row['listing_date'] if pd.notna(row['listing_date']) else '1970-01-01'
params_list.append((
row['a_stock_code'],
row['b_stock_code'] if pd.notna(row['b_stock_code']) else None,
row['short_name'],
row['extended_name'] if pd.notna(row['extended_name']) else None,
row['eng_name'] if pd.notna(row['eng_name']) else None,
listing_date,
row['exchange']
))
# 批量执行插入
try:
total_rows = len(params_list)
if total_rows == 0:
logger.error("没有有效数据可插入")
return False
batch_size = 1000 # 每批插入1000条记录
logger.info(f"开始插入数据,共 {total_rows} 条记录")
# 分批插入,避免大事务问题
for i in range(0, total_rows, batch_size):
batch_params = params_list[i:i+batch_size]
affected_rows = db.execute_many(insert_sql, batch_params)
logger.info(f"已处理 {min(i+batch_size, total_rows)}/{total_rows} 条记录")
logger.info(f"成功插入/更新 {total_rows} 条记录")
return True
except Exception as e:
logger.error(f"插入数据失败: {e}")
# 记录前5个参数以帮助调试
if params_list:
logger.debug(f"前5个参数示例: {params_list[:5]}")
return False
def verify_data_in_db(self, db: MySQLHelper, sample_size: int = 5) -> bool:
"""验证数据库中的数据"""
try:
# 检查记录总数
count_sql = "SELECT COUNT(*) AS total FROM stocks_sh"
result = db.execute_query(count_sql)
db_count = result[0]['total'] if result else 0
logger.info(f"数据库中共有 {db_count} 条记录")
# 随机抽样检查
sample_sql = f"""
SELECT a_stock_code, short_name, listing_date
FROM stocks_sh
ORDER BY RAND()
LIMIT {sample_size}
"""
samples = db.execute_query(sample_sql)
logger.info("\n随机抽样记录:")
for idx, sample in enumerate(samples, 1):
logger.info(f"{idx}. {sample['a_stock_code']}: {sample['short_name']} ({sample['listing_date']})")
return True
except Exception as e:
logger.error(f"数据验证失败: {e}")
return False
def run_import(self) -> bool:
"""执行完整的导入流程"""
logger.info(f"开始导入股票数据,数据目录: {self.data_dir}")
start_time = datetime.now()
# 1. 查找CSV文件
csv_file = self.find_csv_file()
if not csv_file:
return False
# 2. 验证文件
if not self.validate_file(csv_file):
return False
# 3. 读取CSV数据
if not self.read_csv_data(csv_file):
return False
# 4. 清洗数据
if not self.clean_stock_data():
return False
# 显示前5条数据
logger.info("\n前5条股票数据:")
for i, row in self.df.head().iterrows():
logger.info(f"{row['a_stock_code']}: {row['short_name']} ({row['listing_date']})")
# 5. 连接数据库并导入
try:
with MySQLHelper(**self.db_config) as db:
# 5.1 创建表
if not self.create_stocks_table(db):
return False
# 5.2 插入数据
if not self.insert_data_to_db(db):
return False
# 5.3 验证数据
if not self.verify_data_in_db(db):
return False
except Exception as e:
logger.error(f"数据库操作异常: {e}")
return False
# 计算执行时间
duration = datetime.now() - start_time
logger.info(f"数据处理成功完成! 总耗时: {duration.total_seconds():.2f}")
return True
if __name__ == "__main__":
# 数据库配置
db_config = {
'host': 'localhost',
'user': 'root',
'password': 'bzskmysql',
'database': 'fullmarketdata_a'
}
# 获取当前脚本所在目录
current_dir = Path(__file__).parent if "__file__" in locals() else Path.cwd()
# 设置数据目录
DATA_DIR = current_dir / "data"
# 确保data目录存在
DATA_DIR.mkdir(exist_ok=True, parents=True)
# 安装依赖 (如果chardet未安装)
try:
import chardet
except ImportError:
logger.info("安装chardet库以支持编码检测...")
import subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "chardet"])
import chardet
# 创建导入器并执行导入
importer = StockDataImporter(DATA_DIR, db_config)
if importer.run_import():
logger.info("股票数据导入成功!")
else:
logger.error("股票数据导入失败,请检查日志了解详情")