|
|
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Markdown to Word Converter (简化版)
- 不依赖外部库的版本,适用于网络受限环境
- 功能:
- 1. 将Markdown转换为格式化的纯文本
- 2. 生成HTML版本(可复制粘贴到Word)
- 3. 不需要安装任何外部库
- """
- import os
- import sys
- import re
- from pathlib import Path
def markdown_to_text(md_content):
    """Convert Markdown source into decorated plain text.

    The input is processed line by line. Headings, bullet items, numbered
    items, fenced code blocks and pipe tables each get a textual marker;
    everything else is passed through ``clean_markdown_formatting``.

    Args:
        md_content: The complete Markdown document as one string.

    Returns:
        The converted text, with lines joined by ``'\\n'``.
    """
    text_lines = []
    in_code_block = False

    for raw_line in md_content.split('\n'):
        line = raw_line.strip()

        # Fences are handled before anything else so that code content is
        # never mistaken for a heading, list item or table row.
        if line.startswith('```'):
            if in_code_block:
                text_lines.append('【代码块结束】')
                in_code_block = False
            elif len(line) > 6 and line.endswith('```'):
                # Opening and closing fence on the same line.
                text_lines.append(f"【代码】 {line[3:-3].strip()}")
            else:
                # Plain fence or fence with a language tag ("```python").
                text_lines.append('【代码块开始】')
                in_code_block = True
            continue

        if in_code_block:
            # Keep leading indentation: it is significant inside code.
            text_lines.append(raw_line.rstrip())
            continue

        if not line:
            text_lines.append('')
            continue

        # Headings: the number of leading '#' characters is the level.
        if line.startswith('#'):
            title_text = line.lstrip('#').strip()
            level = len(line) - len(line.lstrip('#'))

            if level == 1:
                text_lines.append('=' * 50)
                text_lines.append(f"【主标题】 {title_text}")
                text_lines.append('=' * 50)
            elif level == 2:
                text_lines.append('-' * 40)
                text_lines.append(f"【二级标题】 {title_text}")
                text_lines.append('-' * 40)
            elif level == 3:
                text_lines.append(f"【三级标题】 {title_text}")
                text_lines.append('-' * 20)
            else:
                text_lines.append(f"{' ' * (level-1)}● {title_text}")
            continue

        # Bullet list items.
        if line.startswith(('- ', '* ', '+ ')):
            list_text = clean_markdown_formatting(line[2:].strip())
            text_lines.append(f" • {list_text}")
            continue

        # Numbered list items. Whitespace after the dot is required so that
        # a paragraph starting with "1.5 ..." is not split into "1. 5 ...".
        numbered = re.match(r'^(\d+)\.\s+(.*)', line)
        if numbered:
            number, list_text = numbered.groups()
            list_text = clean_markdown_formatting(list_text)
            text_lines.append(f" {number}. {list_text}")
            continue

        # Table rows: any line containing at least two pipes.
        if line.count('|') >= 2:
            # Drop at most one outer pipe per side so rows written without
            # outer pipes ("a | b | c") keep their first and last cells.
            inner = line[1:] if line.startswith('|') else line
            inner = inner[:-1] if inner.endswith('|') else inner
            cells = [cell.strip() for cell in inner.split('|')]
            # Separator rows such as "|---|:---:|" carry no content.
            if not all(re.fullmatch(r':?-+:?', cell) for cell in cells):
                text_lines.append(f"【表格】 {' | '.join(cells)}")
            continue

        # Ordinary paragraph text.
        text_lines.append(clean_markdown_formatting(line))

    return '\n'.join(text_lines)
# Static document head (doctype, charset and stylesheet) emitted before the
# converted body.
_HTML_HEAD = '''<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Markdown转换结果</title>
<style>
body { font-family: Arial, "Microsoft YaHei", sans-serif; line-height: 1.6; margin: 40px; }
h1 { color: #2c3e50; border-bottom: 3px solid #3498db; padding-bottom: 10px; }
h2 { color: #34495e; border-bottom: 2px solid #bdc3c7; padding-bottom: 8px; }
h3 { color: #7f8c8d; }
ul, ol { margin: 10px 0; padding-left: 30px; }
li { margin: 5px 0; }
code { background-color: #f8f8f8; padding: 2px 5px; border-radius: 3px; }
pre { background-color: #f8f8f8; padding: 15px; border-radius: 5px; overflow-x: auto; }
table { border-collapse: collapse; width: 100%; margin: 20px 0; }
th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
th { background-color: #f2f2f2; }
strong { color: #2c3e50; }
em { color: #e74c3c; }
.warning { color: #e67e22; font-weight: bold; }
</style>
</head>
<body>'''


def markdown_to_html(md_content):
    """Convert Markdown source into a standalone HTML document.

    The input is processed line by line. Consecutive bullet items, numbered
    items and table rows are grouped into a single ``<ul>``, ``<ol>`` or
    ``<table>`` so that ordered lists number correctly and tables render as
    one grid. A fenced code block left open at the end of the input is still
    emitted.

    Args:
        md_content: The complete Markdown document as one string.

    Returns:
        The HTML document, with lines joined by ``'\\n'``.
    """
    html_lines = [_HTML_HEAD]
    code_lines = []
    in_code_block = False
    open_container = None  # 'ul', 'ol', 'table' or None
    table_rows = 0  # data rows emitted into the currently open table

    def use_container(tag):
        """Close the open container if it differs from *tag*, then open *tag*."""
        nonlocal open_container, table_rows
        if tag == open_container:
            return
        if open_container is not None:
            html_lines.append(f'</{open_container}>')
        if tag is not None:
            html_lines.append(f'<{tag}>')
        open_container = tag
        table_rows = 0

    def flush_code():
        """Emit the buffered (already escaped) code lines as one <pre> block."""
        html_lines.append('<pre><code>' + '\n'.join(code_lines) + '</code></pre>')
        code_lines.clear()

    for raw_line in md_content.split('\n'):
        line = raw_line.strip()

        # Fences first, so code content is never parsed as Markdown.
        if line.startswith('```'):
            if in_code_block:
                flush_code()
                in_code_block = False
            else:
                use_container(None)
                if len(line) > 6 and line.endswith('```'):
                    # Opening and closing fence on the same line.
                    code = html_escape(line[3:-3].strip())
                    html_lines.append(f'<pre><code>{code}</code></pre>')
                else:
                    in_code_block = True
            continue

        if in_code_block:
            # Escape the unstripped line so indentation is preserved.
            code_lines.append(html_escape(raw_line))
            continue

        if not line:
            use_container(None)
            html_lines.append('<br>')
            continue

        # Headings: HTML only defines h1..h6, so deeper levels clamp to h6.
        if line.startswith('#'):
            level = min(len(line) - len(line.lstrip('#')), 6)
            title_text = apply_html_formatting(line.lstrip('#').strip())
            use_container(None)
            html_lines.append(f'<h{level}>{title_text}</h{level}>')
            continue

        # Bullet list items.
        if line.startswith(('- ', '* ', '+ ')):
            use_container('ul')
            list_text = apply_html_formatting(line[2:].strip())
            html_lines.append(f'<li>{list_text}</li>')
            continue

        # Numbered list items. Whitespace after the dot is required so that
        # a paragraph starting with "1.5 ..." is not treated as a list item.
        numbered = re.match(r'^\d+\.\s+(.*)', line)
        if numbered:
            use_container('ol')
            list_text = apply_html_formatting(numbered.group(1))
            html_lines.append(f'<li>{list_text}</li>')
            continue

        # Table rows: any line containing at least two pipes.
        if line.count('|') >= 2:
            # Drop at most one outer pipe per side so rows written without
            # outer pipes keep their first and last cells.
            inner = line[1:] if line.startswith('|') else line
            inner = inner[:-1] if inner.endswith('|') else inner
            cells = [cell.strip() for cell in inner.split('|')]

            if all(re.fullmatch(r':?-+:?', cell) for cell in cells):
                # Separator row: emit nothing, but if it directly follows the
                # first row of the table, promote that row to header cells.
                # Cell text is escaped, so only our own tags can match here.
                if open_container == 'table' and table_rows == 1:
                    html_lines[-1] = (html_lines[-1]
                                      .replace('<td>', '<th>')
                                      .replace('</td>', '</th>'))
                continue

            use_container('table')
            html_cells = [f'<td>{apply_html_formatting(cell)}</td>' for cell in cells]
            html_lines.append(f'<tr>{"".join(html_cells)}</tr>')
            table_rows += 1
            continue

        # Ordinary paragraph text.
        use_container(None)
        html_lines.append(f'<p>{apply_html_formatting(line)}</p>')

    # An unterminated fence must not swallow the code collected so far.
    if in_code_block:
        flush_code()
    use_container(None)

    html_lines.append('</body></html>')
    return '\n'.join(html_lines)
def clean_markdown_formatting(text):
    """Replace inline Markdown markup with plain-text equivalents.

    Bold text is wrapped in 【】, italic markers are removed, code spans
    become 【代码:...】, links are reduced to their link text and the warning
    emoji is replaced by 【警告】.

    Code spans are set aside before the other rules run, so asterisks and
    brackets inside backticks are kept literally.

    Args:
        text: One line (or fragment) of Markdown text.

    Returns:
        The text with inline markup replaced.
    """
    code_spans = []

    def stash(match):
        # Swap the code span for a NUL-delimited index placeholder that none
        # of the following patterns can match.
        code_spans.append(match.group(1))
        return f'\x00{len(code_spans) - 1}\x00'

    def restore(match):
        index = int(match.group(1))
        if index >= len(code_spans):
            # Not one of our placeholders (the input itself contained NULs).
            return match.group(0)
        return f'【代码:{code_spans[index]}】'

    text = re.sub(r'`(.*?)`', stash, text)
    # Bold
    text = re.sub(r'\*\*(.*?)\*\*', r'【\1】', text)
    # Italic
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    # Links: keep only the link text
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    # Put the code spans back, now decorated
    text = re.sub(r'\x00(\d+)\x00', restore, text)
    # Warning emoji
    text = text.replace('⚠️', '【警告】')

    return text
def apply_html_formatting(text):
    """Convert inline Markdown markup in *text* to HTML.

    The text is HTML-escaped first, then bold, italic, code spans and links
    are turned into ``<strong>``, ``<em>``, ``<code>`` and ``<a>`` elements,
    and the warning emoji is wrapped in a ``<span class="warning">``.

    Code spans are set aside before the other rules run, so asterisks and
    brackets inside backticks are kept literally instead of becoming
    emphasis or links.

    Args:
        text: One line (or fragment) of Markdown text.

    Returns:
        An HTML fragment.
    """
    # Escape first so that only tags generated below reach the output.
    text = html_escape(text)

    code_spans = []

    def stash(match):
        # Swap the code span for a NUL-delimited index placeholder that none
        # of the following patterns can match.
        code_spans.append(match.group(1))
        return f'\x00{len(code_spans) - 1}\x00'

    def restore(match):
        index = int(match.group(1))
        if index >= len(code_spans):
            # Not one of our placeholders (the input itself contained NULs).
            return match.group(0)
        return f'<code>{code_spans[index]}</code>'

    text = re.sub(r'`(.*?)`', stash, text)
    # Bold
    text = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', text)
    # Italic
    text = re.sub(r'\*(.*?)\*', r'<em>\1</em>', text)
    # Links
    # NOTE(review): the URL scheme is not validated, so a "javascript:" link
    # in the source document is passed through unchanged.
    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>', text)
    # Put the code spans back
    text = re.sub(r'\x00(\d+)\x00', restore, text)
    # Warning emoji
    text = text.replace('⚠️', '<span class="warning">⚠️</span>')

    return text
def html_escape(text):
    """Escape the characters that are special in HTML.

    ``&``, ``<``, ``>``, ``"`` and ``'`` are replaced by character
    references so the text is safe in element content and in quoted
    attribute values.

    Args:
        text: The raw text.

    Returns:
        The escaped text.
    """
    # Imported locally so the function does not depend on the file's
    # top-level import block.
    import html

    return html.escape(text, quote=True)
def main():
    """Command-line entry point.

    Takes the Markdown path from the first command-line argument, falling
    back to a built-in default path, and writes a ``.txt`` and a ``.html``
    conversion next to the input file. Progress and errors are reported on
    standard output; nothing is returned.
    """
    # Fallback input used when no argument is supplied.
    default_md_file = r"D:\BaiduSyncdisk\api111\网站结构说明.md"

    cli_args = sys.argv[1:]
    md_file = Path(cli_args[0]) if cli_args else Path(default_md_file)

    # Bail out early when there is nothing to convert.
    if not md_file.exists():
        print(f"错误: 文件不存在 - {md_file}")
        print("请确保文件路径正确,或者将文件路径作为参数传递给脚本")
        print(f"用法: python {sys.argv[0]} <markdown文件路径>")
        return

    print(f"📁 输入文件: {md_file}")
    print("-" * 50)

    try:
        md_content = md_file.read_text(encoding='utf-8')
        print("✅ 文件读取成功")

        # Plain-text rendition.
        text_output = markdown_to_text(md_content)
        text_file = md_file.with_suffix('.txt')
        text_file.write_text(text_output, encoding='utf-8')
        print(f"📄 纯文本版本已生成: {text_file}")

        # HTML rendition.
        html_output = markdown_to_html(md_content)
        html_file = md_file.with_suffix('.html')
        html_file.write_text(html_output, encoding='utf-8')
        print(f"🌐 HTML版本已生成: {html_file}")

        usage_notes = (
            "\n📋 使用说明:",
            "1. 纯文本版本(.txt): 可以直接复制粘贴到任何文档",
            "2. HTML版本(.html): 用浏览器打开,然后复制粘贴到Word保持格式",
            " - 在浏览器中打开生成的HTML文件",
            " - 按Ctrl+A全选,然后Ctrl+C复制",
            " - 在Word中按Ctrl+V粘贴,格式会自动保留",
        )
        for note in usage_notes:
            print(note)

        print("\n✅ 转换完成!生成了2个文件:")
        print(f" 📄 {text_file}")
        print(f" 🌐 {html_file}")

    except Exception as e:
        # Top-level boundary: report any failure instead of a traceback.
        print(f"❌ 转换失败: {e}")
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
复制代码 |
|