
noone's crawler: 精通汽修

The PDF download endpoint itself does no per-file authorization check, but the site still authenticates the user's identity, so a VIP account's cookie is required.

When you visit http://www.jt269.com/view-xxxx.html, the response contains a long directory listing; the lines beginning with <li class="pdf-dir"><i class="fa fa-folder"></i><span> carry the directory structure and the file names.

Maintain the current directory with a stack. When you read <li class="pdf-dir"><i class="fa fa-folder">, what follows is a directory name, so push the content of its <span> tag onto the stack (ignoring the <ul style="display:none"> wrappers along the way); each further occurrence of the same pattern pushes another name. When you read <li class="pdf-end hasview viptip" exetn=".pdf"><i class="fa fa-file-pdf-o">, what follows is a file name: join the name in its <span> tag with all the directory names currently on the stack to get the file's full path. When you read </li>, you are leaving the current directory, so pop the most recently pushed name.
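
A minimal sketch of that stack walk, assuming the tree is scanned as raw markup and the tag strings are exactly the ones shown above (walk_tree and its tokenizer are illustrative, not part of the scripts below):

import re

def walk_tree(html):
    """Scan the markup token by token, keeping the current directory path on a stack."""
    stack = []    # one slot per open <li>; directories hold their name, other items hold None
    files = []    # full '@'-joined paths of every PDF entry found
    # Split the markup into tag tokens and text tokens so they can be read in order.
    tokens = [t.strip() for t in re.findall(r'<[^>]+>|[^<]+', html)]
    for i, tok in enumerate(tokens):
        if tok.startswith('<li'):
            if 'pdf-dir' in tok:
                # A directory: the text inside the next <span> is its name, push it.
                stack.append(tokens[tokens.index('<span>', i) + 1])
            elif 'pdf-end' in tok:
                # A file: join its name with every directory name currently on the stack.
                name = tokens[tokens.index('<span>', i) + 1]
                files.append('@'.join([s for s in stack if s] + [name]))
                stack.append(None)   # placeholder so the file's own </li> pops cleanly
            else:
                stack.append(None)   # any other <li> just occupies a slot
        elif tok == '</li>' and stack:
            stack.pop()              # closing the current <li>, directory or not
        # <ul style="display:none"> wrappers and the <i> icons fall through untouched
    return files

Pushing a placeholder for non-directory <li> entries keeps the pushes and pops balanced, since every <li>, file or folder, is eventually closed by its own </li>; the recursive BeautifulSoup version in analyze_html.py below does the same bookkeeping implicitly.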

Then let the AI iterate on the details. Note that some file names are xxx.pdf while others are xxx%20.pdf (a space before .pdf, URL-encoded).
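
The fallback for the two name variants can be as simple as requesting both spellings of the same entry; a small sketch (the entry line is the example format from result.html, the cookie is a placeholder, and requests encodes the extra space as %20 on the wire):

import requests

line = "2024年捷途山海T2@01 维修手册@01 发动机机械@02 SQRH4J15 发动机机械系统@00 目录.pdf-8363"
headers = {"Cookie": "REPLACE_WITH_VIP_COOKIE"}  # the VIP cookie mentioned above
for candidate in (line, line.replace(".pdf-", " .pdf-")):
    resp = requests.get(f"http://www.jt269.com/{candidate}", headers=headers)
    if resp.status_code == 200 and resp.content:  # a non-empty body counts as success
        break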

beta1.0:

import os
import requests
from bs4 import BeautifulSoup
import time
import subprocess
import re

def download_file(url, save_path):
    print(f"Downloading file from URL: {url}")  # log the URL being downloaded

    # Make sure the target directory exists
    dir_path = os.path.dirname(save_path)
    if dir_path and not os.path.exists(dir_path):
        os.makedirs(dir_path)

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
        "Cookie": "REPLACE_WITH_VIP_COOKIE"  # put the VIP account's cookie here (angry meow)
    }

    response = requests.get(url, stream=True, headers=headers)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)

        # Check the file size
        file_size = os.path.getsize(save_path)
        print(f"Downloaded: {save_path} (Size: {file_size} bytes)")
        return file_size > 0  # report whether the download actually produced data
    else:
        print(f"Failed to download: {url}")
        return False

def create_directory(path):
    if not os.path.exists(path):
        os.makedirs(path)

def log_response(content, log_file="response_log.txt"):
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(content + "\n")

def extract_directory_tree(html_content):
    """Extract the directory tree structure from the HTML content."""
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the first <li class="pdf-dir"><i class="fa fa-folder"></i><span> element
    tree_start = soup.find('li', class_='pdf-dir')
    if tree_start:
        # Return the whole directory tree
        return str(tree_start)
    return None

def parse_result_and_download(base_url, result_file="result.html"):
    """Parse result.html and download every file listed in it."""
    if not os.path.exists(result_file):
        print(f"Result file {result_file} not found!")
        return

    with open(result_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Extract all file paths
    soup = BeautifulSoup(content, 'html.parser')
    lines = content.split('\n')

    for line in lines:
        line = line.strip()
        if line and not line.startswith('<') and not line.endswith('>') and '@' in line:
            # Parse the file path
            # Format: 2024年捷途山海T2@01 维修手册@01 发动机机械@02 SQRH4J15 发动机机械系统@00 目录.pdf-8363
            parts = line.split('@')
            if len(parts) < 2:
                continue

            # The last part holds the file name and the ID
            last_part = parts[-1]
            if '.pdf-' in last_part:
                file_id = last_part.split('.pdf-')[-1]
                file_name = last_part.replace(f'.pdf-{file_id}', '.pdf')

                # Build the folder path
                folder_parts = parts[:-1] + [file_name.replace('.pdf', '')]
                folder_path = os.path.join(*folder_parts)

                # Build the full file path, saved under the script's own directory
                current_dir = os.path.dirname(os.path.abspath(__file__))
                file_path = os.path.join(current_dir, *parts[:-1], file_name)

                # Build the download URL
                download_url = f"{base_url}/{line}"

                print(f"Downloading: {file_name}")
                print(f"URL: {download_url}")
                print(f"Save to: {file_path}")

                # Try to download the file
                success = download_file(download_url, file_path)

                # If the download failed or the file is 0 KB, retry with a space before .pdf
                if not success:
                    print("First attempt failed, trying with space before .pdf...")
                    # Insert a space before .pdf
                    modified_line = line.replace('.pdf-', ' .pdf-')
                    retry_download_url = f"{base_url}/{modified_line}"

                    print(f"Retry URL: {retry_download_url}")
                    success = download_file(retry_download_url, file_path)

                    if success:
                        print("Retry download successful!")
                    else:
                        print("Both attempts failed!")

                time.sleep(1)  # wait one second between downloads

def parse_and_download(base_url, view_id):
    # Build the request URL
    url = f"{base_url}/view-{view_id}.html"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Referer": f"{base_url}/search-0-1-1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Cookie": "ds_session=60evcvoaee7cpkjp02ob5vmhs6p4t3s6; _d_id=c23721e2a52b135471099b9b44f3cc"
    }

    # Send the request
    response = requests.get(url, headers=headers)
    print(f"Request sent to URL: {url}")  # log the request URL
    if response.status_code != 200:
        print(f"Failed to fetch page: {url}")
        return

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    print("Response received, parsing HTML...")  # log that a response arrived
    log_response(response.text)  # record the response body to a file

    # Extract the directory tree structure
    tree_html = extract_directory_tree(response.text)
    if not tree_html:
        print("No directory tree found on the page.")
        return

    # Save it to example.html
    current_dir = os.path.dirname(os.path.abspath(__file__))
    example_file = os.path.join(current_dir, "example.html")
    with open(example_file, 'w', encoding='utf-8') as f:
        f.write(f"<ul>\n{tree_html}\n</ul>")

    print(f"Directory tree saved to {example_file}")

    # Run analyze_html.py to analyse the tree
    print("Running analyze_html.py...")
    try:
        # Directory of the current script
        current_dir = os.path.dirname(os.path.abspath(__file__))
        analyze_script = os.path.join(current_dir, 'analyze_html.py')

        result = subprocess.run(['python', analyze_script, view_id],
                                capture_output=True, text=True, encoding='utf-8',
                                cwd=current_dir)
        if result.returncode == 0:
            print("analyze_html.py executed successfully")
            print(result.stdout)
        else:
            print("Error running analyze_html.py:")
            print(result.stderr)
            return
    except Exception as e:
        print(f"Failed to run analyze_html.py: {e}")
        return

    # Parse the result and download the files
    print("Starting to download files based on result.html...")
    current_dir = os.path.dirname(os.path.abspath(__file__))
    result_file = os.path.join(current_dir, "result.html")
    parse_result_and_download(base_url, result_file)

if __name__ == "__main__":
    base_url = "http://www.jt269.com"
    view_id = input("Enter the view ID (e.g., 8363): ")
    parse_and_download(base_url, view_id)

analyze_html.py:

from bs4 import BeautifulSoup
import sys

def analyze_html(input_file, output_file, view_id="8363"):
    # Read the HTML file
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Parse the HTML content
    soup = BeautifulSoup(content, 'html.parser')

    # Find the root li element (e.g. 2024年捷途山海T2)
    root_li = soup.find('li', class_='pdf-dir')

    result = []

    def parse_directory(ul_element, current_path=[]):
        """Recursively parse the directory structure."""
        if ul_element is None:
            return

        # Iterate over the direct child li elements
        for li in ul_element.find_all('li', recursive=False):
            span = li.find('span')
            if span is None:
                continue

            item_name = span.text.strip()

            if 'pdf-dir' in li.get('class', []):
                # This is a directory
                new_path = current_path + [item_name]
                print(f"Entering directory: {'/'.join(new_path)}")

                # Recurse into the child ul element
                child_ul = li.find('ul')
                if child_ul:
                    parse_directory(child_ul, new_path)

            elif 'pdf-end' in li.get('class', []):
                # This is a file
                file_path = current_path + [item_name]
                full_path = '@'.join(file_path)
                full_path = f"{full_path}.pdf-{view_id}"
                result.append(full_path)
                print(f"Found file: {'/'.join(file_path)}")

    # Start parsing from the root li element
    if root_li:
        root_span = root_li.find('span')
        if root_span:
            root_name = root_span.text.strip()
            print(f"Root directory: {root_name}")

            # Find the ul element under the root directory
            root_ul = root_li.find('ul')
            if root_ul:
                parse_directory(root_ul, [root_name])

    # Build the result HTML
    result_html = "<html><body><ul>\n"
    for path in result:
        result_html += f"{path}\n"
    # result_html += "</ul></body></html>"

    # Remove redundant empty tags
    result_html = result_html.replace("<ul>\n</ul>\n", "")

    # Write the result file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(result_html)

    print(f"Directory structure has been written to {output_file}")
    print(f"Total files found: {len(result)}")

if __name__ == "__main__":
    input_file = "C:\\post\\精通汽修爬虫\\example.html"
    output_file = "C:\\post\\精通汽修爬虫\\result.html"
    view_id = sys.argv[1] if len(sys.argv) > 1 else "8363"
    analyze_html(input_file, output_file, view_id)