1. Buy a server

Aliyun server purchase link:
https://t.aliyun.com/U/E8o0aM
Backup link if the one above expires:
https://www.aliyun.com/daily-act/ecs/activity_selection?source=5176.29345612&userCode=49hts92d

Tencent Cloud:
https://curl.qcloud.com/wJpWmSfU
Backup link if the one above expires:
https://cloud.tencent.com/act/cps/redirect?redirect=2446&cps_key=ad201ee2ef3b771157f72ee5464b1fea&from=console

Huawei Cloud:
https://activity.huaweicloud.com/cps.html?fromacct=64b5cf7cc11b4840bb4ed2ea0b2f4468&utm_source=V1g3MDY4NTY=&utm_medium=cps&utm_campaign=201905

2. Deployment tutorial

3. The code is as follows:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import csv
import hashlib
import json
import random
import re
import time
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from dateutil import tz
from tqdm import tqdm

BASE_API = "https://reportapi.eastmoney.com/report/list"
DEFAULT_HEADERS = {
    "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/124.0 Safari/537.36"),
    "Referer": "https://data.eastmoney.com/report/",
    "Accept": "application/json,text/javascript,*/*;q=0.9",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
}
TIMEOUT = (10, 20)
RETRY = 3
MIN_VALID_PDF = 1024  # deliberately lenient, so very small but valid PDFs are not rejected
PDF_URL_RE = re.compile(
    rb'https://pdf\.dfcfw\.com/pdf/[^\s"\'<>]+\.pdf[^\s"\'<>]*',
    re.IGNORECASE
)

session = requests.Session()
session.headers.update(DEFAULT_HEADERS)


@dataclass
class ReportItem:
    title: str
    org: str
    industry: str
    stock: str
    publish_date: str
    detail_url: str | None
    pdf_url: str | None
    info_code: str | None


# ---------------- helpers ----------------
def sanitize(name: str) -> str:
    """Strip characters that are illegal in file names."""
    name = re.sub(r"[\t\r\n]+", " ", name or "").strip()
    return re.sub(r'[\\/:*?"<>|]+', "_", name)


def jsonp_to_json(text: str) -> dict:
    """Unwrap a JSONP response: parse the JSON between the outermost parentheses."""
    l = text.find("(")
    r = text.rfind(")")
    if l == -1 or r == -1:
        raise ValueError("unexpected JSONP response")
    return json.loads(text[l + 1:r])


def build_params(begin, end, page_no, page_size, qtype, code, industry_code, org_code):
    cb = f"datatable{random.randint(1_000_000, 9_999_999)}"  # random JSONP callback name
    return {
        "cb": cb,
        "pageNo": page_no,
        "pageSize": page_size,
        "beginTime": begin,
        "endTime": end,
        "qType": qtype,
        "code": code or "*",
        "industryCode": industry_code or "*",
        "industry": "*",
        "orgCode": org_code or "",
        "rating": "*",
        "ratingChange": "*",
        "fields": "",
        "am": "",
    }


def fetch_jsonp(params: dict) -> dict:
    for _ in range(RETRY):
        try:
            r = session.get(BASE_API, params=params, timeout=TIMEOUT)
            if r.ok:
                return jsonp_to_json(r.text)
        except requests.RequestException:
            pass
        time.sleep(0.8)
    raise RuntimeError("API request failed or retry limit exceeded")


def parse_item(raw: dict) -> ReportItem:
    return ReportItem(
        title=raw.get("title") or "",
        org=raw.get("orgSName") or raw.get("orgName") or "",
        industry=raw.get("industryName") or "",
        stock=raw.get("stockName") or "",
        publish_date=(raw.get("publishDate") or raw.get("publishTime") or "")[:10],
        detail_url=raw.get("url") or raw.get("researchUrl") or raw.get("pageUrl") or None,
        pdf_url=raw.get("pdfUrl") or raw.get("pdf") or None,
        info_code=raw.get("infoCode") or raw.get("id") or None,
    )


def extract_pdf_from_html(html: str) -> str | None:
    """Extract the PDF link from the detail-page HTML."""
    # 1) global byte-level regex (more robust: also matches URLs inside <script> strings)
    m = PDF_URL_RE.search(html.encode("utf-8", errors="ignore"))
    if m:
        return m.group(0).decode(errors="ignore")
    # 2) DOM scan: <a href>, <iframe src>, <embed src>
    soup = BeautifulSoup(html, "html.parser")
    for a in soup.find_all("a", href=True):
        href = a.get("href")
        if isinstance(href, str) and href.endswith(".pdf") and "pdf.dfcfw.com" in href:
            return href
    for i in soup.find_all("iframe", src=True):
        src = i.get("src")
        if isinstance(src, str) and src.endswith(".pdf") and "pdf.dfcfw.com" in src:
            return src
    for e in soup.find_all("embed", src=True):
        src = e.get("src")
        if isinstance(src, str) and src.endswith(".pdf") and "pdf.dfcfw.com" in src:
            return src
    return None


def get_pdf_url_from_detail(detail_url: str) -> str | None:
    for _ in range(RETRY):
        try:
            r = session.get(detail_url, timeout=TIMEOUT)
            if not r.ok:
                time.sleep(0.6)
                continue
            pdf = extract_pdf_from_html(r.text)
            if pdf:
                return pdf
        except requests.RequestException:
            time.sleep(0.6)
    return None


def pick_pdf_and_referer(item: ReportItem) -> tuple[str | None, str | None]:
    # 1) link supplied directly by the API
    if isinstance(item.pdf_url, str) and item.pdf_url.lower().endswith(".pdf"):
        return item.pdf_url, "https://data.eastmoney.com/report/"
    # 2) parse the detail page
    if item.detail_url and item.detail_url.startswith("http"):
        pdf = get_pdf_url_from_detail(item.detail_url)
        if pdf:
            return pdf, item.detail_url
    # 3) last-resort guess assembled from infoCode
    if item.info_code:
        return f"https://pdf.dfcfw.com/pdf/H3_{item.info_code}_1.pdf", "https://data.eastmoney.com/report/"
    return None, None


def looks_like_pdf(content: bytes) -> bool:
    head = content[:2048].lstrip()  # ignore leading whitespace/BOM
    if b"<html" in content[:4096].lower():
        return False
    return head.startswith(b"%PDF-") and len(content) >= MIN_VALID_PDF


def needs_redownload(path: Path) -> bool:
    if not path.exists() or path.stat().st_size == 0:
        return True
    try:
        with path.open("rb") as f:
            data = f.read(2048)
        if not looks_like_pdf(data):  # reuse the same header check
            return True
    except Exception:
        return True
    return False


def sha256_file(path: Path) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()


def make_outpath(item: ReportItem, out_root: Path) -> Path:
    date_str = item.publish_date or "unknown-date"
    folder = out_root / date_str
    parts = [date_str, sanitize(item.industry or item.stock), sanitize(item.title), sanitize(item.org)]
    base = "-".join([p for p in parts if p]).strip("-") or "report"
    return folder / f"{base}.pdf"


def write_manifest(csv_path: Path, row: dict):
    is_new = not csv_path.exists()
    with csv_path.open("a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["ts", "title", "org", "industry", "stock",
                                          "publishDate", "detailUrl", "pdfUrl", "savedAs", "sha256"])
        if is_new:
            w.writeheader()
        w.writerow(row)


# ---------------- main flow ----------------
def main():
    ap = argparse.ArgumentParser(description="Batch download Eastmoney research-report PDFs (hardened)")
    ap.add_argument("--begin", required=True, help="start date YYYY-MM-DD")
    ap.add_argument("--end", required=True, help="end date YYYY-MM-DD")
    ap.add_argument("--qtype", default="0", help="category (default 0 = composite; 1 = industry; etc.)")
    ap.add_argument("--code", default=None, help="stock code, e.g. 600887")
    ap.add_argument("--industry_code", default=None, help="industry code")
    ap.add_argument("--org", default=None, help="institution code")
    ap.add_argument("--page_size", type=int, default=50, help="items per page, default 50")
    ap.add_argument("--sleep", type=float, default=0.4, help="delay between pages, default 0.4 s")
    ap.add_argument("--out", default="./em_reports", help="output directory")
    ap.add_argument("--debug", action="store_true", help="print debug information")
    args = ap.parse_args()

    out_root = Path(args.out).resolve()
    out_root.mkdir(parents=True, exist_ok=True)
    manifest = out_root / "manifest.csv"

    # page 1
    params = build_params(args.begin, args.end, 1, args.page_size, args.qtype,
                          args.code, args.industry_code, args.org)
    first = fetch_jsonp(params)
    total_pages = int(first.get("TotalPage") or 1)
    total_size = int(first.get("size") or 0)
    print(f"{total_size} reports in the period, about {total_pages} pages; saving to: {out_root}")
    time.sleep(args.sleep)

    for page in tqdm(range(1, total_pages + 1), desc="Pages"):
        data = first if page == 1 else fetch_jsonp({**params, "pageNo": page})
        items = data.get("data") or []
        for raw in items:
            item = parse_item(raw)
            pdf_url, referer = pick_pdf_and_referer(item)
            if not pdf_url:
                if args.debug:
                    print(f"⚠️ no PDF resolved: {item.title} | {item.detail_url}")
                continue
            out_path = make_outpath(item, out_root)
            if needs_redownload(out_path):
                headers = {
                    "Referer": referer or "https://data.eastmoney.com/report/",
                    "Accept": "application/pdf,*/*;q=0.9",
                    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                }
                ok = False
                for attempt in range(1, RETRY + 1):
                    try:
                        r = session.get(pdf_url, headers=headers, timeout=TIMEOUT, allow_redirects=True)
                        content = r.content
                        if looks_like_pdf(content):
                            out_path.parent.mkdir(parents=True, exist_ok=True)
                            out_path.write_bytes(content)
                            ok = True
                            break
                        else:
                            if args.debug:
                                print(f"❌ not a PDF (attempt {attempt}/{RETRY}): {pdf_url} | "
                                      f"status={r.status_code} | ct={r.headers.get('Content-Type')} | "
                                      f"head={content[:200]!r}")
                    except requests.RequestException as e:
                        if args.debug:
                            print(f"❌ request error (attempt {attempt}/{RETRY}): {pdf_url} | {e}")
                    time.sleep(0.8)
                # on failure, try the other source: if the current URL was guessed
                # from infoCode, parse the detail page instead, and vice versa
                if not ok and item.detail_url:
                    alt = get_pdf_url_from_detail(item.detail_url)
                    if alt and alt != pdf_url:
                        try:
                            r = session.get(alt,
                                            headers={"Referer": item.detail_url,
                                                     "Accept": "application/pdf,*/*;q=0.9",
                                                     "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"},
                                            timeout=TIMEOUT, allow_redirects=True)
                            content = r.content
                            if looks_like_pdf(content):
                                out_path.parent.mkdir(parents=True, exist_ok=True)
                                out_path.write_bytes(content)
                                ok = True
                            elif args.debug:
                                print(f"❌ fallback link is still not a PDF: {alt} | status={r.status_code} | "
                                      f"ct={r.headers.get('Content-Type')} | head={content[:200]!r}")
                        except requests.RequestException as e:
                            if args.debug:
                                print(f"❌ fallback link request error: {alt} | {e}")
                if not ok:
                    print(f"❌ download failed: {item.title} | {pdf_url}")
                    continue
            file_hash = sha256_file(out_path)
            write_manifest(manifest, {
                "ts": datetime.now(tz.tzlocal()).isoformat(),
                "title": item.title,
                "org": item.org,
                "industry": item.industry,
                "stock": item.stock,
                "publishDate": item.publish_date,
                "detailUrl": item.detail_url or "",
                "pdfUrl": pdf_url,
                "savedAs": str(out_path),
                "sha256": file_hash,
            })
        time.sleep(args.sleep)


if __name__ == "__main__":
    main()
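A typical invocation, assuming the script is saved as em_reports.py (the filename is my choice; any name works) and the third-party packages it imports (requests, beautifulsoup4, python-dateutil, tqdm) are installed:

python3 em_reports.py --begin 2025-01-01 --end 2025-01-31 --qtype 0 --out ./em_reports --debug

The dates here are examples; --code, --industry_code, and --org narrow the query, and --debug prints per-item diagnostics.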
This script batch-downloads Eastmoney research-report PDFs. It pages through the Eastmoney report-list API over a time range, robustly resolves each report's PDF link, and downloads the files locally:

PDF resolution priority: the API's pdfUrl → detail-page parsing (a/iframe/embed/regex) → a fallback URL assembled from infoCode.
Resume/dedup: PDFs that already exist and are valid are skipped; undersized or non-PDF files are re-downloaded automatically.
Anti-scraping and reliability: Referer header, timeouts and retries, randomized JSONP callback name; validation checks the PDF header rather than just the file extension.
Writes manifest.csv (timestamp, title, institution, industry, save path, SHA256 checksum); a sketch of one row follows this list.
Command-line options (time range, page size, stock/industry/institution filters, output directory, sleep, debug).
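As a sketch of what one manifest row holds, here is a call to the script's write_manifest with invented values (every field value below is made up for illustration; only the field names come from the script):

from pathlib import Path

# Illustrative only: the values are invented; write_manifest is defined in the script above.
write_manifest(Path("./em_reports/manifest.csv"), {
    "ts": "2025-01-15T09:30:00+08:00",
    "title": "Example industry weekly",
    "org": "Example Securities",
    "industry": "Food",
    "stock": "",
    "publishDate": "2025-01-15",
    "detailUrl": "https://data.eastmoney.com/report/",    # placeholder
    "pdfUrl": "https://pdf.dfcfw.com/pdf/H3_XXXX_1.pdf",  # placeholder
    "savedAs": "em_reports/2025-01-15/2025-01-15-Food-Example industry weekly-Example Securities.pdf",
    "sha256": "0" * 64,  # placeholder digest
})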
Main functions

sanitize(name)
Strips characters that are illegal in file names, so the save path does not raise errors.

jsonp_to_json(text)
Parses the Eastmoney JSONP response into a JSON object (a small example follows this list).

build_params(begin, end, page_no, page_size, qtype, code, industry_code, org_code)
Builds the /report/list query parameters (including the random cb callback name).

fetch_jsonp(params)
Requests the list API (with retries and timeouts) and returns the parsed JSON.

parse_item(raw) → ReportItem
Normalizes one report record (title, institution, industry, stock, publish date, detail page, pdfUrl, infoCode).

extract_pdf_from_html(html)
Extracts the PDF link from the detail-page HTML through multiple channels: 1) a full-text byte-level regex match against pdf.dfcfw.com; 2) a DOM scan of a[href], iframe[src], and embed[src].

get_pdf_url_from_detail(detail_url)
Fetches the detail page and runs the parser above, with retries.

pick_pdf_and_referer(item)
Decides the PDF URL and Referer to download with: the API-supplied link first; otherwise detail-page parsing; as a last resort a fallback URL assembled from infoCode (see the second example after this list).

looks_like_pdf(content)
Validity check: the first 2 KB (after stripping leading whitespace) must start with %PDF-, the first 4 KB must not be HTML, and the length must be ≥ MIN_VALID_PDF (exercised in the third example after this list).

needs_redownload(path)
Local file check: missing / empty / header does not look like a PDF → needs re-downloading.

sha256_file(path)
Computes the SHA256 of a saved file, recorded in the manifest for verification.

make_outpath(item, out_root)
Uses the date as a subdirectory; the file name is built from "date-industry/stock-title-institution"; returns the final save path (see the last example after this list).

write_manifest(csv_path, row)
Appends a row to manifest.csv, writing the header on first write.

main()
Parses command-line arguments → fetches page 1 for pagination info → iterates over the pages: decides the PDF URL and Referer for each record; downloads with that Referer when needed, printing diagnostics under debug and trying a fallback link on failure; on success saves the file, computes its hash, and writes the manifest; throttles pages and requests by the --sleep interval.
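Example for jsonp_to_json: the function simply parses whatever sits between the outermost parentheses. The payload below is a made-up miniature of the /report/list response shape, assuming the script's definitions are in scope:

# Invented miniature payload; only the field names mirror the real API.
raw = 'datatable1234567({"TotalPage": 3, "size": 120, "data": []})'
parsed = jsonp_to_json(raw)
print(parsed["TotalPage"], parsed["size"])  # -> 3 120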
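Example for the infoCode fallback in pick_pdf_and_referer: when a record has neither a usable pdfUrl nor a parseable detail page, the URL is assembled from a fixed pattern (the infoCode below is invented):

item = ReportItem(
    title="demo", org="", industry="", stock="",
    publish_date="2025-01-15",
    detail_url=None, pdf_url=None,
    info_code="AP0000000000000000",  # made-up infoCode
)
print(pick_pdf_and_referer(item))
# -> ('https://pdf.dfcfw.com/pdf/H3_AP0000000000000000_1.pdf',
#     'https://data.eastmoney.com/report/')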
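Example for looks_like_pdf: the three cases below follow directly from its rules (%PDF- header, no HTML in the first 4 KB, length ≥ MIN_VALID_PDF = 1024):

print(looks_like_pdf(b"%PDF-1.7" + b"\x00" * 2048))            # True: PDF header, large enough
print(looks_like_pdf(b"<html><body>Not Found</body></html>"))  # False: an HTML error page
print(looks_like_pdf(b"%PDF-1.7"))                             # False: header, but below MIN_VALID_PDF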
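Example for make_outpath, with invented field values:

from pathlib import Path

item = ReportItem(
    title="Example industry weekly", org="Example Securities",
    industry="Food", stock="", publish_date="2025-01-15",
    detail_url=None, pdf_url=None, info_code=None,
)
print(make_outpath(item, Path("./em_reports")))
# -> em_reports/2025-01-15/2025-01-15-Food-Example industry weekly-Example Securities.pdf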
Note:
Some variables in this article have been redacted. Everything here is for testing, learning, and research only; commercial use is prohibited. No guarantee is made of its legality, accuracy, completeness, or validity; judge for yourself before relying on it. If you need technical help, discussion is available by way of a tip.