import requests
import re
import subprocess
import os
import urllib.parse
import jsbeautifier
import time
import random
# ================= ⚙️ Configuration =================
# 1. Target page (the starting page; the script pages forward from here automatically)
# Examples: "https://missav.ws/zh/genres/uncensored-leak" or "https://missav.ws/dm143/genres/潮吹"
TARGET_PAGE = "https://missav.ws/dm143/genres/潮吹"
# 2. Rate limits (set to "0" to disable)
# Download limit (e.g. "10M" = 10 MB/s, "500K" = 500 KB/s)
DOWNLOAD_LIMIT = "10M"
# Upload limit (e.g. "5M")
UPLOAD_LIMIT = "0"
# 3. Rclone settings (must match the remote Name from `rclone config`)
RCLONE_REMOTE = "onedrive"
REMOTE_DIR = "Videos/MissAV"
# 4. Paging limit (0 means unlimited; keep paging until the last page)
MAX_PAGES = 0
# 5. Paths (usually no need to change)
WORK_DIR = "/root"
ARCHIVE_FILE = f"{WORK_DIR}/downloaded_history.txt"
YTDLP_PATH = "/root/dl_env/bin/yt-dlp"
FLARESOLVERR_URL = "http://localhost:8191/v1"
# ====================================================
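
# Quick sanity checks before the first run (standard rclone / pip / yt-dlp
# invocations; adjust the virtualenv path if yours differs):
#
#   rclone lsd onedrive:                                 # remote must match RCLONE_REMOTE
#   /root/dl_env/bin/pip install requests jsbeautifier   # script dependencies
#   /root/dl_env/bin/yt-dlp --version                    # confirm YTDLP_PATH works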
def get_page_via_flaresolverr(url):
    """Fetch a page through FlareSolverr so Cloudflare challenges are solved."""
    print(f">>> [Monitor] Scanning page: {url}")
    headers = {"Content-Type": "application/json"}
    payload = {
        "cmd": "request.get",
        "url": url,
        "maxTimeout": 60000
    }
    try:
        # timeout slightly above maxTimeout so requests cannot hang forever
        response = requests.post(FLARESOLVERR_URL, json=payload, headers=headers, timeout=90)
        data = response.json()
        if data.get("status") == "ok":
            return data["solution"]
        print(f">>> FlareSolverr error: {data.get('message')}")
        return None
    except Exception as e:
        print(f">>> Failed to connect to FlareSolverr: {e}")
        return None
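
# For reference, a successful FlareSolverr v1 response has roughly this shape
# (field names as documented upstream; verify against your installed version):
#
#   {
#     "status": "ok",
#     "message": "...",
#     "solution": {
#       "url": "...",
#       "status": 200,
#       "response": "<html>...</html>",        # fully rendered page source
#       "cookies": [{"name": "...", "value": "...", "domain": "...", ...}],
#       "userAgent": "Mozilla/5.0 ..."
#     }
#   }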
def extract_next_page(html_source):
    """Extract the next-page link, if any."""
    match = re.search(r'<a[^>]+href="([^"]+)"[^>]*rel="next"', html_source)
    if not match:
        match = re.search(r'<a[^>]+rel="next"[^>]*href="([^"]+)"', html_source)
    if match:
        next_url = match.group(1)
        if not next_url.startswith("http"):
            if next_url.startswith("/"):
                return f"https://missav.ws{next_url}"
            return f"https://missav.ws/{next_url}"
        return next_url
    return None
def extract_video_links(html_source):
    """Extract video detail-page links from the listing HTML."""
    links = set()
    # Video pages end in a code-style slug such as "sone-123"
    pattern = r'href="([^"]*?/[\w]+-[\d]+)"'
    matches = re.findall(pattern, html_source)
    for link in matches:
        if not link.startswith("http"):
            full_link = f"https://missav.ws{link}" if link.startswith("/") else f"https://missav.ws/{link}"
        else:
            full_link = link
        # Skip navigation/category pages that happen to match the pattern
        if any(x in full_link for x in ["/genres/", "/makers/", "/actresses/", "/search", "/login", "label", "series"]):
            continue
        links.add(full_link)
    return list(links)
def is_downloaded(url):
    if not os.path.exists(ARCHIVE_FILE):
        return False
    video_id = url.rstrip('/').split("/")[-1]
    with open(ARCHIVE_FILE, 'r') as f:
        return video_id in f.read()
def mark_as_downloaded(url):
    video_id = url.rstrip('/').split("/")[-1]
    with open(ARCHIVE_FILE, 'a') as f:
        f.write(f"{video_id}\n")
def clean_url(raw_url):
    """Undo JSON escaping and percent-encoding in an extracted URL."""
    url = raw_url.replace(r'\/', '/').replace('%3A', ':').replace('%2F', '/')
    return urllib.parse.unquote(url).replace('\\', '')
def extract_m3u8(html_source):
    """Unpack the obfuscated player script and extract the M3U8 URL."""
    scripts = re.findall(r'<script[^>]*>(.*?)</script>', html_source, re.DOTALL | re.IGNORECASE)
    for script in scripts:
        if "eval(function(p,a,c,k,e,d)" in script:
            try:
                unpacked = jsbeautifier.beautify(script)
                match = re.search(r'(https?://[^"\';\s]+\.m3u8[^"\';\s]*)', unpacked)
                if match:
                    return clean_url(match.group(1))
                match = re.search(r'(https?://[^"\';\s]*surrit\.com[^"\';\s]*)', unpacked)
                if match:
                    return clean_url(match.group(1))
            except Exception:
                pass
    # Fallback: look for a CDN URL in the raw page source
    match = re.search(r'(https?://[^\s"\'<>]*(?:surrit\.com|sixyik\.com|missav)[^\s"\'<>]*)', html_source)
    if match and "m3u8" in match.group(1):
        return clean_url(match.group(1))
    return None
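
# For context: the player config is wrapped in a Dean Edwards-style packer,
# whose signature is the "eval(function(p,a,c,k,e,d)" prefix matched above.
# An illustrative (made-up) payload looks like:
#
#   eval(function(p,a,c,k,e,d){...}('0 1=\'2://3.4/5.6\';',7,7,
#       'var|source|https|surrit|com|playlist|m3u8'.split('|'),0,{}))
#
# jsbeautifier detects this packer and returns the decoded JavaScript, in
# which the m3u8 URL appears as plain text and can be pulled out with a regex.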
def save_cookies(cookies):
    cookie_file = "monitor_cookies.txt"
    with open(cookie_file, 'w') as f:
        f.write("# Netscape HTTP Cookie File\n")
        for cookie in cookies:
            domain = cookie.get('domain', '')
            if not domain.startswith('.') and re.search(r'[a-zA-Z]', domain):
                domain = '.' + domain
            f.write(f"{domain}\tTRUE\t{cookie.get('path', '/')}\t"
                    f"{'TRUE' if cookie.get('secure') else 'FALSE'}\t"
                    f"{cookie.get('expiry', 0)}\t{cookie.get('name', '')}\t{cookie.get('value', '')}\n")
    return cookie_file
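
# The file written above follows the 7-field, tab-separated Netscape cookie
# format that yt-dlp's --cookies option expects:
#
#   domain  include-subdomains  path  secure  expiry  name  value
#
# Example line (values are made up):
#   .missav.ws\tTRUE\t/\tTRUE\t1767225600\tcf_clearance\tabc123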
def process_video(url):
    print(f"\n>>> [Processing] {url}")
    solution = get_page_via_flaresolverr(url)
    if not solution:
        return
    m3u8_url = extract_m3u8(solution["response"])
    if not m3u8_url:
        print(">>> ❌ Skipping: failed to extract M3U8")
        return
    video_id = url.rstrip('/').split("/")[-1]
    filename = f"{video_id}.mp4"
    cookie_file = save_cookies(solution["cookies"])
    cmd = [
        YTDLP_PATH, m3u8_url,
        "--output", filename,
        "--no-part",
        "--user-agent", solution["userAgent"],
        "--cookies", cookie_file
    ]
    if DOWNLOAD_LIMIT != "0":
        cmd.extend(["--limit-rate", DOWNLOAD_LIMIT])
    print(f">>> 🚀 Starting download (rate limit: {DOWNLOAD_LIMIT}): {filename}")
    try:
        subprocess.run(cmd, check=True)
        print(">>> ✅ Download finished, uploading to OneDrive...")
        rclone_cmd = ["rclone", "move", filename, f"{RCLONE_REMOTE}:{REMOTE_DIR}", "--progress"]
        if UPLOAD_LIMIT != "0":
            rclone_cmd.extend(["--bwlimit", UPLOAD_LIMIT])
        subprocess.run(rclone_cmd, check=True)
        mark_as_downloaded(url)
        print(">>> ✨ Done!")
    except subprocess.CalledProcessError:
        print(">>> ❌ Download or upload failed")
    finally:
        if os.path.exists(cookie_file):
            os.remove(cookie_file)
if __name__ == "__main__":
    current_url = TARGET_PAGE
    page_count = 0
    while current_url:
        page_count += 1
        print(f"\n{'=' * 20} Processing page {page_count} {'=' * 20}")
        page_solution = get_page_via_flaresolverr(current_url)
        if not page_solution:
            print(">>> Failed to fetch the page; stopping.")
            break
        html = page_solution["response"]
        video_links = extract_video_links(html)
        print(f">>> Found {len(video_links)} videos on this page")
        new_downloads = 0
        for link in video_links:
            if is_downloaded(link):
                print(f">>> [Skipped] Already in download history: {link}")
                continue
            process_video(link)
            new_downloads += 1
            time.sleep(5)
        print(f">>> Page {page_count} done; {new_downloads} new videos downloaded.")
        if MAX_PAGES > 0 and page_count >= MAX_PAGES:
            print(f">>> Reached the page limit ({MAX_PAGES}); stopping.")
            break
        next_page = extract_next_page(html)
        if next_page:
            print(f">>> 🔎 Found next page: {next_page}")
            current_url = next_page
            wait_time = random.randint(10, 20)
            print(f">>> Sleeping {wait_time} seconds before continuing...")
            time.sleep(wait_time)
        else:
            print(">>> ✅ No next page found; all pages have been crawled!")
            current_url = None
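
# A typical way to run this (script name and log path are illustrative;
# FlareSolverr must already be listening on FLARESOLVERR_URL, e.g. via its
# official Docker image):
#
#   nohup /root/dl_env/bin/python missav_monitor.py > monitor.log 2>&1 &
#
# nohup keeps the crawl alive after the SSH session closes; follow progress
# with `tail -f monitor.log`.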