import json
import os
import random
import time
from datetime import datetime, timedelta

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
def re_name(excel_name):
    """Remove characters that are illegal in Windows file names.

    Args:
        excel_name: Raw title/organization string from the API.

    Returns:
        The string with /, \\, :, *, ?, ", <, >, | removed.
    """
    # Characters forbidden in Windows paths; also unsafe on most filesystems.
    sets = ['/', '\\', ':', '*', '?', '"', '<', '>', '|']
    for char in sets:
        excel_name = excel_name.replace(char, '')
    return excel_name
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
headers = {'User-Agent': random.choice(my_headers)}
def get_page(url):
    """Download *url* and return the body decoded as UTF-8.

    Uses a Session with a connect-retry policy (3 retries, exponential
    backoff) mounted on both http:// and https://.

    Returns:
        The decoded page text on HTTP 200, otherwise None (including when
        an SSL error occurs).
    """
    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    try:
        response = session.get(url, headers=headers)
        if response.status_code == 200:
            # Decode explicitly to avoid mojibake from a mis-detected charset.
            return response.content.decode("utf-8")
    except requests.exceptions.SSLError as e:
        print(f"SSL error while fetching {url}: {e}")
    return None
def down_load(data_end):
    """Download one report's PDF into the local "reports" folder.

    Args:
        data_end: One record dict from the eastmoney report API; keys used
            are 'title', 'orgSName', 'industryName', 'infoCode'.

    Side effects:
        Creates the "reports" directory if needed and writes
        "<industry>-<title>-<organ>.pdf" into it. Skips the download when
        the target file already exists.
    """
    excel_name = data_end['title']
    excel_organ = data_end['orgSName']
    industryName = data_end['industryName']
    down_loadurl = f"https://pdf.dfcfw.com/pdf/H3_{data_end['infoCode']}_1.pdf"
    # Sanitize every name component so the assembled path is a legal file name.
    excel_name = re_name(excel_name)
    excel_organ = re_name(excel_organ)
    industryName = re_name(industryName)
    local_folder = os.path.join(os.getcwd(), "reports")  # 将所有报告存储在 "reports" 文件夹中
    file_name = f"{industryName}-{excel_name}-{excel_organ}.pdf"  # 文件名不包含日期
    full_path = os.path.join(local_folder, file_name)
    os.makedirs(local_folder, exist_ok=True)
    if os.path.isfile(full_path):
        print(f"文件已存在,跳过下载: {full_path}")
        return  # BUGFIX: previously fell through and re-downloaded anyway.
    with open(full_path, 'wb') as code:
        download_pdf = requests.get(down_loadurl)
        code.write(download_pdf.content)
    print(f"文件已保存为: {full_path}")
def page_data(pageno_num, start_date, end_date):
    """Fetch one page of the report list from the eastmoney API.

    Args:
        pageno_num: Page number to request, as a string.
        start_date: Begin date, "YYYY-MM-DD".
        end_date: End date, "YYYY-MM-DD".

    Returns:
        The parsed JSON dict (keys include 'data' and 'TotalPage').
    """
    # The API answers JSONP: datatableXXXXXXX({...}); the callback name
    # echoes whatever random number we send in the cb= parameter.
    num_random_7 = random.randint(1000000, 9999999)
    time2 = int(time.time() * 1000)  # cache-busting timestamp parameter
    html_url = f'https://reportapi.eastmoney.com/report/list?cb=datatable{num_random_7}&industryCode=*&pageSize=50&industry=*&rating=*&ratingChange=*&beginTime={start_date}&endTime={end_date}&pageNo={pageno_num}&fields=&qType=1&orgCode=&code=*&rcode=&_={time2}'
    html = get_page(html_url)
    # Strip the JSONP wrapper. str.strip(prefix) would remove a character
    # *set* from both ends (corrupting the JSON), so cut the exact prefix.
    prefix = f'datatable{num_random_7}('
    if html.startswith(prefix):
        html = html[len(prefix):]
    html = html.rstrip(')')  # drop the closing parenthesis of the callback
    data_frist = json.loads(html)
    return data_frist
# 设置起始日期和结束日期 (format: YYYY-MM-DD)
start_date = "2024-11-22"  # replace with the desired start date
end_date = "2024-12-14"  # replace with the desired end date

# First request only discovers how many pages the query spans.
data_frist = page_data(str(1), start_date, end_date)
TotalPage = data_frist.get('TotalPage', 1)

# Walk every page (page 1 is refetched, which keeps the loop uniform)
# and download each report it lists.
for f in range(1, TotalPage + 1):
    data_frist = page_data(str(f), start_date, end_date)
    value_list = data_frist.get('data', [])
    for data_end in value_list:
        down_load(data_end)  # download each individual report