Python 文本清理

import re
from bs4 import BeautifulSoup

def cleartxt(
    src_path: str = r"C:\Users\admin\Desktop\2\fengshui_ok.txt",
    dst_path: str = r"C:\Users\admin\Desktop\2\cleaned_file.txt",
) -> None:
    """Strip HTML markup and punctuation from a text file and save the result.

    The source file is read as UTF-8, parsed with BeautifulSoup's
    ``html.parser`` to drop any HTML markup, filtered so that only word
    characters and whitespace remain, and written to the destination file
    as UTF-8.

    Args:
        src_path: Path of the text file to clean. Defaults to the location
            the original script used.
        dst_path: Path the cleaned text is written to; an existing file is
            overwritten. Defaults to the location the original script used.

    Raises:
        OSError: If the source cannot be opened for reading or the
            destination cannot be opened for writing.
        UnicodeDecodeError: If the source file is not valid UTF-8.
    """
    # Read the whole source file into memory.
    with open(src_path, "r", encoding="utf8") as src:
        raw = src.read()

    # Let BeautifulSoup drop the HTML markup and keep only the text nodes.
    plain = BeautifulSoup(raw, "html.parser").get_text()

    # Remove punctuation: delete everything that is neither a word character
    # nor whitespace. On str patterns ``\w`` is Unicode-aware, so CJK
    # characters, letters, and digits are kept -- and so is the underscore.
    cleaned = re.sub(r"[^\w\s]", "", plain)

    # Save the cleaned text.
    with open(dst_path, "w", encoding="utf8") as dst:
        dst.write(cleaned)
© 版权声明
THE END
喜欢就支持一下吧
点赞9赞赏 分享