import os
import re
import subprocess
import time

import requests
from bs4 import BeautifulSoup
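# Third-party dependencies (everything else is from the standard library):
#   pip install requests beautifulsoup4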
def download_file(url, save_path):
    """Stream a file from url to save_path, creating parent directories as needed."""
    print(f"Downloading file from URL: {url}")
    dir_path = os.path.dirname(save_path)
    if dir_path and not os.path.exists(dir_path):
        os.makedirs(dir_path)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
        # TODO: replace this placeholder with the VIP account's cookie
        "Cookie": "REPLACE_WITH_VIP_COOKIE"
    }
    response = requests.get(url, stream=True, headers=headers)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
        file_size = os.path.getsize(save_path)
        print(f"Downloaded: {save_path} (Size: {file_size} bytes)")
        return file_size > 0
    else:
        print(f"Failed to download: {url}")
        return False
def create_directory(path):
    if not os.path.exists(path):
        os.makedirs(path)
def log_response(content, log_file="response_log.txt"):
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(content + "\n")
def extract_directory_tree(html_content):
    """Extract the directory-tree markup (the <li class="pdf-dir"> node) from the page HTML."""
    soup = BeautifulSoup(html_content, 'html.parser')
    tree_start = soup.find('li', class_='pdf-dir')
    if tree_start:
        return str(tree_start)
    return None
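# parse_result_and_download() below assumes that each plain-text line in
# result.html encodes a download path such as (hypothetical example):
#
#     folder@subfolder@document.pdf-12345
#
# where '@' separates directory levels and '-12345' is a file id appended to
# the .pdf name. This format is inferred from the parsing logic itself, not
# from any documented spec of the target site.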
def parse_result_and_download(base_url, result_file="result.html"):
    """Parse result.html and download every file listed in it."""
    if not os.path.exists(result_file):
        print(f"Result file {result_file} not found!")
        return
    with open(result_file, 'r', encoding='utf-8') as f:
        content = f.read()
    soup = BeautifulSoup(content, 'html.parser')
    lines = content.split('\n')
    for line in lines:
        line = line.strip()
        # Skip HTML markup; only plain lines containing '@' describe files.
        if line and not line.startswith('<') and not line.endswith('>') and '@' in line:
            parts = line.split('@')
            if len(parts) < 2:
                continue
            last_part = parts[-1]
            if '.pdf-' in last_part:
                file_id = last_part.split('.pdf-')[-1]
                file_name = last_part.replace(f'.pdf-{file_id}', '.pdf')
                folder_parts = parts[:-1] + [file_name.replace('.pdf', '')]
                folder_path = os.path.join(*folder_parts)
                current_dir = os.path.dirname(os.path.abspath(__file__))
                file_path = os.path.join(current_dir, *parts[:-1], file_name)
                download_url = f"{base_url}/{line}"
                print(f"Downloading: {file_name}")
                print(f"URL: {download_url}")
                print(f"Save to: {file_path}")
                success = download_file(download_url, file_path)
                if not success:
                    print("First attempt failed, trying with space before .pdf...")
                    modified_line = line.replace('.pdf-', ' .pdf-')
                    retry_download_url = f"{base_url}/{modified_line}"
                    print(f"Retry URL: {retry_download_url}")
                    success = download_file(retry_download_url, file_path)
                    if success:
                        print("Retry download successful!")
                    else:
                        print("Both attempts failed!")
                time.sleep(1)
def parse_and_download(base_url, view_id):
    """Fetch the view page, save its directory tree, run analyze_html.py, then download the files it lists."""
    url = f"{base_url}/view-{view_id}.html"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Referer": f"{base_url}/search-0-1-1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Cookie": "ds_session=60evcvoaee7cpkjp02ob5vmhs6p4t3s6; _d_id=c23721e2a52b135471099b9b44f3cc"
    }

    response = requests.get(url, headers=headers)
    print(f"Request sent to URL: {url}")
    if response.status_code != 200:
        print(f"Failed to fetch page: {url}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    print("Response received, parsing HTML...")
    log_response(response.text)

    tree_html = extract_directory_tree(response.text)
    if not tree_html:
        print("No directory tree found on the page.")
        return

    current_dir = os.path.dirname(os.path.abspath(__file__))
    example_file = os.path.join(current_dir, "example.html")
    with open(example_file, 'w', encoding='utf-8') as f:
        f.write(f"<ul>\n{tree_html}\n</ul>")
    print(f"Directory tree saved to {example_file}")

    print("Running analyze_html.py...")
    try:
        analyze_script = os.path.join(current_dir, 'analyze_html.py')
        result = subprocess.run(['python', analyze_script, view_id],
                                capture_output=True, text=True,
                                encoding='utf-8', cwd=current_dir)
        if result.returncode == 0:
            print("analyze_html.py executed successfully")
            print(result.stdout)
        else:
            print("Error running analyze_html.py:")
            print(result.stderr)
            return
    except Exception as e:
        print(f"Failed to run analyze_html.py: {e}")
        return

    print("Starting to download files based on result.html...")
    result_file = os.path.join(current_dir, "result.html")
    parse_result_and_download(base_url, result_file)
if __name__ == "__main__":
    base_url = "http://www.jt269.com"
    view_id = input("Enter the view ID (e.g., 8363): ")
    parse_and_download(base_url, view_id)
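# Typical run (the script filename is hypothetical; analyze_html.py is assumed
# to sit next to this script and to write result.html into the same folder):
#   $ python download_site_pdfs.py
#   Enter the view ID (e.g., 8363): 8363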