最近遇到一个问题:怎样检查已经发布的文章里的敏感词。百度 AI 的文本检测功能有 25 万的免费额度,但在网上找了很多插件,感觉都不好用——我需要的是既能检测、又能替换敏感词的功能。于是想到用 Python 写一段代码,直接连接 WordPress 数据库来处理。流程如下:
第一,连接 WordPress 数据库,定位数据表,提取数据
# Connection settings for the site's MySQL database (the accompanying article
# describes it as the WordPress database).
# NOTE(review): the host and credentials below are hard-coded in plain text;
# consider loading them from the environment or a config file kept out of
# version control.
# NOTE(review): `pymysql` is not imported anywhere in this excerpt — it is
# assumed to be imported elsewhere in the complete script.
MYSQL_URL = '103.91.210.249' # database host address
MYSQL_PORT = 3306 # database port
MYSQL_USER = "qmwu_com" # database user name
MYSQL_PASSWORD = "a7sX2DcPbWbanPWw" # database password
MYSQL_DATABASE = "qmwu_com" # database (schema) name
MYSQL_CHARSET = "utf8" # connection character set
# Open the connection that the rest of the script uses via `conn`.
conn = pymysql.connect(host=MYSQL_URL, port=MYSQL_PORT, user=MYSQL_USER, password=MYSQL_PASSWORD,
database=MYSQL_DATABASE, charset=MYSQL_CHARSET)
定位数据表
# Excerpt from the main checking loop. The `break` / `continue` statements
# below imply an enclosing loop whose header is not part of this excerpt, and
# `cursor` is assumed to have been created from `conn` elsewhere.
#
# Pick one random post that has not been checked yet (ischeck = 0).
# NOTE(review): `ischeck` looks like a custom flag column added to the posts
# table — confirm it exists before running.
sql = cursor.execute("SELECT id,post_content FROM qmwu_posts WHERE ischeck = 0 order by rand() limit 1 ")
# `sql` holds the value returned by execute(); 0 is treated as "no unchecked
# posts remain", which ends the loop.
if sql == 0:
print("检测完毕!")
break
articles = cursor.fetchone()
articles = list(articles)
ids = articles[0] # post id
content = articles[1] # post content
# Posts with empty content are flagged as checked and skipped without calling
# the detection API.
if content == "":
print(articles[0], "文章内容为空!跳过")
cursor.execute("UPDATE qmwu_posts SET ischeck = 1 WHERE id=%s ", ids)
# NOTE(review): no conn.commit() is visible on this path; unless autocommit is
# enabled, this flag update is only persisted by a later commit — confirm.
continue
# Run the post content through the Baidu AI sensitive-word check.
spam = baidu_ai_sensitive_word_detection(content)
百度ai处理
def baidu_ai_sensitive_word_detection(text):
    """Send *text* to Baidu's text-censor endpoint and report the first hit.

    NOTE(review): the published excerpt showed only the function body with its
    indentation stripped and with typographic quotes in place of ASCII quotes.
    The ``def`` line is reconstructed from the call
    ``baidu_ai_sensitive_word_detection(content)`` and from the body's use of
    ``text`` — confirm against the complete script.

    Args:
        text: The text to check (the caller passes a post's content).

    Returns:
        A ``(msg, keyword)`` tuple — the violation type and the matched word
        taken from the first entry of the response's ``data`` list — when the
        response's ``conclusionType`` equals 2; ``False`` otherwise.
    """
    url = (
        "https://aip.baidubce.com/rest/2.0/solution/v1/text_censor/v2/user_defined"
        "?access_token=" + get_access_token()
    )
    # The endpoint is called with a form-encoded body, so the text is
    # URL-encoded before sending.
    payload = urlencode({"text": text})
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'application/json'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    print(response.text)
    result = response.json()
    try:
        if result["conclusionType"] == 2:  # 2 = non-compliant
            first_entry = result["data"][0]
            violation_type = first_entry['msg']  # violation type
            # Only the first keyword of the first hit is reported.
            keyword = first_entry['hits'][0]['wordHitPositions'][0]['keyword']  # offending word
            return violation_type, keyword
        else:
            return False
    except (KeyError, IndexError, TypeError):
        # NOTE(review): the excerpt opened this ``try`` without showing its
        # handler. Returning False keeps the visible "tuple or False" contract,
        # but the caller treats False as compliant, so an error response or a
        # hit without a keyword ends up marked as checked — confirm the
        # intended handling against the complete script.
        return False
# Excerpt from the main checking loop: act on the detection result for the
# current post (`ids`, `content`). The enclosing loop header is not part of
# this excerpt.
spam = baidu_ai_sensitive_word_detection(content)
# A falsy result is treated as "compliant": only the checked flag is set.
if not spam:
print(f"ID:{ids} 的内容合规")
cursor.execute("UPDATE qmwu_posts SET ischeck = 1 WHERE id=%s ", ids)
else:
print(f"ID:{ids} 的内容不合规")
# spam[0] is used as the violation label and spam[1] as the offending keyword.
keyword = spam[1]
# Mask every occurrence of that keyword in the post body, then write it back.
content = content.replace(keyword, '***')
cursor.execute("UPDATE qmwu_posts SET post_content = %s WHERE id = %s and ischeck = 0", (content, ids))
print(spam[0] + ":替换【" + spam[1] + "】,为***")
# NOTE(review): only the single keyword in spam[1] is masked before the post
# is flagged as checked, so any other sensitive words in the same post would
# not be examined again — confirm this is intended.
cursor.execute("UPDATE qmwu_posts SET ischeck = 1 WHERE id=%s ", ids)
# Persist the updates made so far on this connection.
conn.commit()
# time.sleep(1)  # optional 1-second pause between posts (currently disabled)
完整代码如下
© 版权声明
文章版权归作者所有,未经允许请勿转载。
THE END
暂无评论内容