1. Purchase a server
Aliyun: purchase link
https://t.aliyun.com/U/E8o0aM
If that link has expired, use:
https://www.aliyun.com/daily-act/ecs/activity_selection?source=5176.29345612&userCode=49hts92d
Tencent Cloud:
https://curl.qcloud.com/wJpWmSfU
If that link has expired, use:
https://cloud.tencent.com/act/cps/redirect?redirect=2446&cps_key=ad201ee2ef3b771157f72ee5464b1fea&from=console
Huawei Cloud:
https://activity.huaweicloud.com/cps.html?fromacct=64b5cf7cc11b4840bb4ed2ea0b2f4468&utm_source=V1g3MDY4NTY=&utm_medium=cps&utm_campaign=201905
2. Deployment tutorial
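A minimal setup sketch, assuming a Linux host with Python 3.10 or newer (the script uses str | None type hints) and the script from section 3 saved as em_report_downloader.py (the file name is an assumption, not from the original):
pip3 install requests beautifulsoup4 python-dateutil tqdm
python3 em_report_downloader.py --begin 2024-01-01 --end 2024-01-07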
3. The code is as follows
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import csv
import hashlib
import json
import random
import re
import time
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from dateutil import tz
from tqdm import tqdm
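# Eastmoney research-report list endpoint (returns JSONP)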
BASE_API = "https://reportapi.eastmoney.com/report/list"
DEFAULT_HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/124.0 Safari/537.36"
),
"Referer": "https://data.eastmoney.com/report/",
"Accept": "application/json,text/javascript,*/*;q=0.9",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
}
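# (connect, read) timeouts in seconds; RETRY is the number of attempts per request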
TIMEOUT = (10, 20)
RETRY = 3
MIN_VALID_PDF = 1024 # lenient threshold, to avoid rejecting very small but valid PDFs
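# Matches direct pdf.dfcfw.com PDF links anywhere in the raw page bytes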
PDF_URL_RE = re.compile(
rb'https://pdf\.dfcfw\.com/pdf/[^\s"\'<>]+\.pdf[^\s"\'<>]*',
re.IGNORECASE
)
session = requests.Session()
session.headers.update(DEFAULT_HEADERS)
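# One normalized research-report record from the list API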
@dataclass
class ReportItem:
title: str
org: str
industry: str
stock: str
publish_date: str
detail_url: str | None
pdf_url: str | None
info_code: str | None
# ---------------- Utility functions ----------------
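# Make a title/organization string safe to use in a file name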
def sanitize(name: str) -> str:
name = re.sub(r"[\t\r\n]+", " ", name or "").strip()
return re.sub(r'[\\/:*?"<>|]+', "_", name)
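# Strip the JSONP callback wrapper, e.g. datatable1234567({...}) -> dict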
def jsonp_to_json(text: str) -> dict:
l = text.find("("); r = text.rfind(")")
if l == -1 or r == -1:
raise ValueError("unexpected JSONP response")
return json.loads(text[l+1:r])
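# Build the /report/list query string; "*" or "" means no filter, cb is a random JSONP callback name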
def build_params(begin, end, page_no, page_size, qtype, code, industry_code, org_code):
cb = f"datatable{random.randint(1_000_000, 9_999_999)}"
return {
"cb": cb,
"pageNo": page_no,
"pageSize": page_size,
"beginTime": begin,
"endTime": end,
"qType": qtype,
"code": code or "*",
"industryCode": industry_code or "*",
"industry": "*",
"orgCode": org_code or "",
"rating": "*",
"ratingChange": "*",
"fields": "",
"am": "",
}
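# Request one page of the report list, retrying on failure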
def fetch_jsonp(params: dict) -> dict:
for _ in range(RETRY):
try:
r = session.get(BASE_API, params=params, timeout=TIMEOUT)
if r.ok:
return jsonp_to_json(r.text)
except requests.RequestException:
pass
time.sleep(0.8)
raise RuntimeError("list API request failed; retry limit exceeded")
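# Normalize one raw API record into a ReportItem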
def parse_item(raw: dict) -> ReportItem:
return ReportItem(
title = raw.get("title") or "",
org = raw.get("orgSName") or raw.get("orgName") or "",
industry = raw.get("industryName") or "",
stock = raw.get("stockName") or "",
publish_date = (raw.get("publishDate") or raw.get("publishTime") or "")[:10],
detail_url = raw.get("url") or raw.get("researchUrl") or raw.get("pageUrl") or None,
pdf_url = raw.get("pdfUrl") or raw.get("pdf") or None,
info_code = raw.get("infoCode") or raw.get("id") or None
)
def extract_pdf_from_html(html: str) -> str | None:
"""从详情页 HTML 中解析 PDF 链接"""
# 1) 二进制全局正则(更鲁棒,覆盖 script 字符串)
m = PDF_URL_RE.search(html.encode("utf-8", errors="ignore"))
if m:
return m.group(0).decode(errors="ignore")
# 2) DOM search: <a href>, <iframe src>, <embed src>
soup = BeautifulSoup(html, "html.parser")
# a[href]
for a in soup.find_all("a", href=True):
href = a.get("href")
if isinstance(href, str) and href.endswith(".pdf") and "pdf.dfcfw.com" in href:
return href
# iframe[src]
for i in soup.find_all("iframe", src=True):
src = i.get("src")
if isinstance(src, str) and src.endswith(".pdf") and "pdf.dfcfw.com" in src:
return src
# embed[src]
for e in soup.find_all("embed", src=True):
src = e.get("src")
if isinstance(src, str) and src.endswith(".pdf") and "pdf.dfcfw.com" in src:
return src
return None
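# Fetch the detail page and try to extract its PDF link, with retries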
def get_pdf_url_from_detail(detail_url: str) -> str | None:
for _ in range(RETRY):
try:
r = session.get(detail_url, timeout=TIMEOUT)
if not r.ok:
time.sleep(0.6); continue
pdf = extract_pdf_from_html(r.text)
if pdf:
return pdf
except requests.RequestException:
time.sleep(0.6)
return None
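# Choose the PDF URL to download and the Referer to send with it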
def pick_pdf_and_referer(item: ReportItem) -> tuple[str | None, str | None]:
# 1) URL provided directly by the list API
if isinstance(item.pdf_url, str) and item.pdf_url.lower().endswith(".pdf"):
return item.pdf_url, "https://data.eastmoney.com/report/"
# 2) Parse the detail page
if item.detail_url and item.detail_url.startswith("http"):
pdf = get_pdf_url_from_detail(item.detail_url)
if pdf:
return pdf, item.detail_url
# 3) Last-resort guess assembled from infoCode
if item.info_code:
return f"https://pdf.dfcfw.com/pdf/H3_{item.info_code}_1.pdf", "https://data.eastmoney.com/report/"
return None, None
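# Heuristic: real PDFs start with %PDF- and are not HTML error pages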
def looks_like_pdf(content: bytes) -> bool:
head = content[:2048].lstrip() # ignore leading whitespace/BOM
if b"<html" in content[:4096].lower():
return False
return head.startswith(b"%PDF-") and len(content) >= MIN_VALID_PDF
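# A local file must be downloaded again if it is missing, empty, or its head does not look like a PDF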
def needs_redownload(path: Path) -> bool:
if not path.exists() or path.stat().st_size == 0:
return True
try:
with path.open("rb") as f:
data = f.read(2048)
if not looks_like_pdf(data): # reuse the same validity check
return True
except Exception:
return True
return False
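# Hash a saved file in 1 MB chunks for the manifest record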
def sha256_file(path: Path) -> str:
h = hashlib.sha256()
with path.open("rb") as f:
for chunk in iter(lambda: f.read(1 << 20), b""):
h.update(chunk)
return h.hexdigest()
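# Output path: <out_root>/<publish date>/<date>-<industry or stock>-<title>-<org>.pdf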
def make_outpath(item: ReportItem, out_root: Path) -> Path:
date_str = item.publish_date or "unknown-date"
folder = out_root / date_str
parts = [date_str, sanitize(item.industry or item.stock), sanitize(item.title), sanitize(item.org)]
base = "-".join([p for p in parts if p]).strip("-") or "report"
return folder / f"{base}.pdf"
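# Append one row to manifest.csv, writing the header on first use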
def write_manifest(csv_path: Path, row: dict):
is_new = not csv_path.exists()
with csv_path.open("a", newline="", encoding="utf-8") as f:
w = csv.DictWriter(f, fieldnames=[
"ts","title","org","industry","stock","publishDate","detailUrl","pdfUrl","savedAs","sha256"
])
if is_new:
w.writeheader()
w.writerow(row)
# ---------------- Main flow ----------------
def main():
ap = argparse.ArgumentParser(description="Bulk downloader for Eastmoney research-report PDFs (hardened version)")
ap.add_argument("--begin", required=True, help="start date, YYYY-MM-DD")
ap.add_argument("--end", required=True, help="end date, YYYY-MM-DD")
ap.add_argument("--qtype", default="0", help="report category (default 0 = composite; 1 = industry, etc.)")
ap.add_argument("--code", default=None, help="stock code, e.g. 600887")
ap.add_argument("--industry_code", default=None, help="industry code")
ap.add_argument("--org", default=None, help="research-institution code")
ap.add_argument("--page_size", type=int, default=50, help="items per page, default 50")
ap.add_argument("--sleep", type=float, default=0.4, help="delay between pages, default 0.4s")
ap.add_argument("--out", default="./em_reports", help="output directory")
ap.add_argument("--debug", action="store_true", help="print debug information")
args = ap.parse_args()
out_root = Path(args.out).resolve()
out_root.mkdir(parents=True, exist_ok=True)
manifest = out_root / "manifest.csv"
# page 1
params = build_params(args.begin, args.end, 1, args.page_size, args.qtype, args.code, args.industry_code, args.org)
first = fetch_jsonp(params)
total_pages = int(first.get("TotalPage") or 1)
total_size = int(first.get("size") or 0)
print(f"时间段内共 {total_size} 条,约 {total_pages} 页;保存至:{out_root}")
time.sleep(args.sleep)
for page in tqdm(range(1, total_pages + 1), desc="Pages"):
data = first if page == 1 else fetch_jsonp({**params, "pageNo": page})
items = data.get("data") or []
for raw in items:
item = parse_item(raw)
pdf_url, referer = pick_pdf_and_referer(item)
if not pdf_url:
if args.debug:
print(f"⚠️ 未解析到 PDF:{item.title} | {item.detail_url}")
continue
out_path = make_outpath(item, out_root)
if needs_redownload(out_path):
headers = {"Referer": referer or "https://data.eastmoney.com/report/",
"Accept": "application/pdf,*/*;q=0.9",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"}
ok = False
for attempt in range(1, RETRY + 1):
try:
r = session.get(pdf_url, headers=headers, timeout=TIMEOUT, allow_redirects=True)
content = r.content
if looks_like_pdf(content):
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_bytes(content)
ok = True
break
else:
if args.debug:
print(f"❌ 非 PDF(尝试{attempt}/{RETRY}):{pdf_url} | "
f"status={r.status_code} | ct={r.headers.get('Content-Type')} | "
f"head={content[:200]!r}")
except requests.RequestException as e:
if args.debug:
print(f"❌ 请求异常(尝试{attempt}/{RETRY}):{pdf_url} | {e}")
time.sleep(0.8)
# Retry on failure: if the current URL came from the infoCode guess, try the detail page instead (and vice versa)
if not ok and item.detail_url:
alt = get_pdf_url_from_detail(item.detail_url)
if alt and alt != pdf_url:
try:
r = session.get(alt, headers={"Referer": item.detail_url,
"Accept":"application/pdf,*/*;q=0.9",
"Accept-Language":"zh-CN,zh;q=0.9,en;q=0.8"},
timeout=TIMEOUT, allow_redirects=True)
content = r.content
if looks_like_pdf(content):
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_bytes(content)
ok = True
elif args.debug:
print(f"❌ 备用链接仍非 PDF:{alt} | status={r.status_code} | "
f"ct={r.headers.get('Content-Type')} | head={content[:200]!r}")
except requests.RequestException as e:
if args.debug:
print(f"❌ 备用链接请求异常:{alt} | {e}")
if not ok:
print(f"❌ 下载失败:{item.title} | {pdf_url}")
continue
file_hash = sha256_file(out_path)
write_manifest(manifest, {
"ts": datetime.now(tz.tzlocal()).isoformat(),
"title": item.title,
"org": item.org,
"industry": item.industry,
"stock": item.stock,
"publishDate": item.publish_date,
"detailUrl": item.detail_url or "",
"pdfUrl": pdf_url,
"savedAs": str(out_path),
"sha256": file_hash,
})
time.sleep(args.sleep)
if __name__ == "__main__":
main()
This script bulk-downloads research-report PDFs from Eastmoney (东方财富). It pages through the Eastmoney report API over a given date range, robustly resolves each report's PDF link, and downloads the files locally:
PDF resolution priority: the API's pdfUrl → detail-page parsing (a/iframe/embed tags plus a regex) → a fallback URL assembled from infoCode.
Resume/dedup: PDFs that already exist and look valid are skipped; tiny or non-PDF files are re-downloaded automatically.
Anti-scraping and reliability: Referer header, timeouts and retries, randomized JSONP callback name; files are validated by their PDF header rather than by extension alone.
Writes manifest.csv (timestamp, title, institution, industry, save path, SHA256 checksum).
Provides command-line options (date range, page size, stock/industry/institution filters, output directory, sleep, debug); see the example invocation right after this list.
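For example, the following command (the script file name em_report_downloader.py is an assumption, not from the original) would fetch industry reports published in January 2024, 50 per page, saving them under ./em_reports with debug output:
python3 em_report_downloader.py --begin 2024-01-01 --end 2024-01-31 --qtype 1 --page_size 50 --out ./em_reports --debug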
Main functions
sanitize(name): strips characters that are illegal in file names so that saving does not fail.
jsonp_to_json(text): parses the Eastmoney JSONP response into a JSON object.
build_params(begin, end, page_no, page_size, qtype, code, industry_code, org_code): builds the /report/list query parameters (including the random cb callback name).
fetch_jsonp(params): calls the list endpoint (with retries and timeouts) and returns the parsed JSON.
parse_item(raw) → ReportItem: normalizes one report record (title, institution, industry, stock, publish date, detail URL, pdfUrl, infoCode).
extract_pdf_from_html(html): extracts the PDF link from detail-page HTML through several channels: 1) a byte-level regex over the whole page for pdf.dfcfw.com links; 2) a DOM scan of a[href], iframe[src] and embed[src].
get_pdf_url_from_detail(detail_url): fetches the detail page and runs the parser above, with retries.
pick_pdf_and_referer(item): decides the PDF URL and Referer used for the download: prefer the URL given by the API; otherwise parse the detail page; as a last resort assemble a fallback URL from infoCode.
looks_like_pdf(content): validity check: the head of the first 2 KB must start with %PDF-, the first 4 KB must not contain HTML, and the content must be at least MIN_VALID_PDF bytes.
needs_redownload(path): local file check: missing, empty, or the head does not look like a PDF → download again.
sha256_file(path): computes the SHA256 of a saved file for the manifest record and later verification.
make_outpath(item, out_root): uses the publish date as a subfolder; the file name is assembled from "date-industry/stock-title-institution"; returns the final save path.
write_manifest(csv_path, row): appends to manifest.csv, writing the header row on first write.
main(): parses the command-line arguments → fetches page 1 to get pagination info → iterates page by page: decides the PDF URL and Referer for each record; downloads with that Referer when needed, printing diagnostics in debug mode and falling back to an alternative link on failure; on success saves the file, computes its hash and writes the manifest row; throttles pages and requests according to --sleep.
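Since each manifest.csv row stores both the saved path and its SHA256, downloaded files can later be re-verified against the manifest. A small standalone sketch (not part of the script above; the manifest path is assumed to be the default output location):
import csv, hashlib
from pathlib import Path

def verify_manifest(manifest="./em_reports/manifest.csv") -> int:
    """Return the number of files whose current SHA256 no longer matches the manifest."""
    bad = 0
    with open(manifest, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            p = Path(row["savedAs"])
            # Recompute the hash and compare it with the recorded value
            digest = hashlib.sha256(p.read_bytes()).hexdigest() if p.exists() else ""
            if digest != row["sha256"]:
                bad += 1
                print(f"MISMATCH: {p}")
    return bad

print(verify_manifest())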
Note:
Some variables in this post have been redacted. The material is provided for testing, learning, and research purposes only; commercial use is prohibited. No guarantee is made of its legality, accuracy, completeness, or validity, so judge for yourself before using it. If you need technical help, you are welcome to discuss it by way of a tip/donation.