import re
from bs4 import BeautifulSoup
# Drop lines that are too short or too long.
def delcd(intxt, outtxt, min_len=6, max_len=30):
    """Copy ``intxt`` to ``outtxt``, keeping only lines of acceptable length.

    A line is kept when ``min_len < n < max_len``, where ``n`` is the number
    of characters on the line *without* its trailing newline.  Measuring the
    raw line would count the terminator as a character and would treat a
    final line lacking a newline differently from every other line.

    Args:
        intxt: Path of the UTF-8 text file to read.
        outtxt: Path of the UTF-8 text file to write (overwritten).
        min_len: Exclusive lower bound on the line length (default 6).
        max_len: Exclusive upper bound on the line length (default 30).
    """
    # Read everything before opening the output for writing.
    with open(intxt, "r", encoding="utf8") as src:
        kept = [
            line for line in src
            if min_len < len(line.rstrip("\n")) < max_len
        ]

    # Kept lines are written unchanged, terminators included.
    with open(outtxt, "w", encoding="utf8") as dst:
        dst.writelines(kept)
# Remove duplicate lines.
def quchong(outtxt, qctxt):
    """Append to ``qctxt`` every line of ``outtxt`` that is not already there.

    Lines are compared verbatim (including their line terminator).  The
    target file is opened in append mode, so lines it already holds from an
    earlier run also count as "seen" and are not written again.  The first
    occurrence of each line is kept, in source order; every skipped
    duplicate is reported on standard output.

    Args:
        outtxt: Path of the UTF-8 text file to read.
        qctxt: Path of the UTF-8 text file to append to; created if missing.

    Raises:
        FileNotFoundError: If ``outtxt`` does not exist.
    """
    with open(outtxt, "r", encoding="utf8") as src:
        lines = src.readlines()

    # Load the target's current content once instead of re-reading the whole
    # file for every source line.
    try:
        with open(qctxt, "r", encoding="utf8") as existing:
            seen = set(existing)
    except FileNotFoundError:
        seen = set()

    # Append mode creates the target even when nothing new is written.
    with open(qctxt, "a", encoding="utf8") as dst:
        for line in lines:
            if line in seen:
                print("已去除重复-->" + line)
            else:
                dst.write(line)
                seen.add(line)
# Strip HTML and punctuation, and substitute content.
def cleartxt(qctxt, qltxt):
    """Normalise the text of ``qctxt`` and write the result to ``qltxt``.

    Steps, in order:

    1. Replace each of the zodiac characters 鼠牛虎龙蛇马羊猴鸡狗猪 with 兔.
    2. Parse the text with BeautifulSoup's ``html.parser`` and keep only the
       text content, which discards HTML markup.
    3. Delete every character that is neither a word character (``\\w``) nor
       whitespace (``\\s``), i.e. punctuation and symbols.

    Args:
        qctxt: Path of the UTF-8 text file to read.
        qltxt: Path of the UTF-8 text file to write (overwritten).
    """
    with open(qctxt, "r", encoding="utf8") as src:
        text = src.read()

    # A single character-class pass replaces the per-character substitutions;
    # 兔 is already the replacement value, so it needs no entry of its own.
    text = re.sub(r"[鼠牛虎龙蛇马羊猴鸡狗猪]", "兔", text)

    # Drop HTML markup, keeping only the text nodes.
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove punctuation and symbols; whitespace (including newlines) stays.
    text = re.sub(r"[^\w\s]", "", text)

    with open(qltxt, "w", encoding="utf8") as dst:
        dst.write(text)
if __name__ == "__main__":
    # Raw input file.
    intxt = r"C:\Users\admin\Desktop\0\newwords.txt"
    # Output of the length filter (lines longer than 6 and shorter than 30).
    outtxt = r"C:\Users\admin\Desktop\0\1.txt"
    # Output after identical lines have been removed.
    qctxt = r"C:\Users\admin\Desktop\0\2.txt"
    # Output after HTML and symbols have been cleaned out.
    qltxt = r"C:\Users\admin\Desktop\0\3.txt"

    # Run the three stages in order; each one reads the previous stage's file.
    delcd(intxt, outtxt)
    quchong(outtxt, qctxt)
    cleartxt(qctxt, qltxt)
# © Copyright notice
# Copyright of this article belongs to the author; do not reproduce it without permission.
# THE END