mirror of
https://github.com/gnh1201/caterpillar.git
synced 2024-11-26 15:31:45 +00:00
304 lines
10 KiB
Python
304 lines
10 KiB
Python
#!/usr/bin/python3
|
|
#
|
|
# fediverse.py
|
|
# Fediverse (Mastodon, Misskey, Pleroma, ...) SPAM filter plugin for Caterpillar Proxy
|
|
#
|
|
# Caterpillar Proxy - The simple and parasitic web proxy with SPAM filter (formerly, php-httpproxy)
|
|
# Namyheon Go (Catswords Research) <abuse@catswords.net>
|
|
# https://github.com/gnh1201/caterpillar
|
|
#
|
|
# Created in: 2022-10-06
|
|
# Updated in: 2024-07-06
|
|
#
|
|
import base64
|
|
import hashlib
|
|
import io
|
|
import re
|
|
import requests
|
|
import os.path
|
|
|
|
from decouple import config
|
|
from PIL import Image
|
|
|
|
from base import Extension, Logger
|
|
|
|
logger = Logger(name="fediverse")

# Fallback values, assigned first so that every setting is always defined.
# Without them, a single missing/invalid setting aborted the whole `try` body
# and left the remaining names undefined, which surfaced later as a NameError
# when the extension was instantiated.  An empty TrueCaptcha user id disables
# the Not-CAPTCHA strategy (see the `truecaptcha_userid != ""` check).
client_encoding = "utf-8"
truecaptcha_userid = ""
truecaptcha_apikey = ""
dictionary_file = "words_alpha.txt"
librey_apiurl = "https://search.catswords.net"

try:
    # Settings that have a default are read first so that a missing
    # TrueCaptcha credential cannot prevent them from being applied.
    client_encoding = config("CLIENT_ENCODING", default="utf-8")
    dictionary_file = config(
        "DICTIONARY_FILE", default="words_alpha.txt"
    )  # https://github.com/dwyl/english-words
    librey_apiurl = config(
        "LIBREY_APIURL", default="https://search.catswords.net"
    )  # https://github.com/Ahwxorg/librey
    truecaptcha_userid = config("TRUECAPTCHA_USERID")  # truecaptcha.org
    truecaptcha_apikey = config("TRUECAPTCHA_APIKEY")  # truecaptcha.org
except Exception as e:
    logger.error("[*] Invalid configuration", exc_info=e)
|
|
|
|
|
|
class Fediverse(Extension):
    """SPAM filter extension for Fediverse (Mastodon, Misskey, Pleroma, ...) traffic.

    The filter extracts 10-character alphanumeric account IDs from a request
    body and judges them with several strategies:

    * K-Anonymity      - the ID is looked up in the Pwned Passwords range API.
    * VowelRatio10     - ratio of vowel-like characters in the ID.
    * Palindrome4      - the ID contains a palindrome of 4+ characters.
    * KnownWords4      - the ID contains a dictionary word of 4+ characters.
    * SearchEngine3    - a LibreY search for the ID yields more than 2 entries.
    * RepeatedNumbers3 - the ID contains a run of 3+ digits.
    * Not-CAPTCHA      - attached .webp images are sent to truecaptcha.org.
    """

    # Seconds to wait for the lookup services and image downloads.  Without a
    # timeout a stalled remote host would block the filter indefinitely.
    _REQUEST_TIMEOUT = 10

    # 10-character alphanumeric IDs directly preceded by "/@" or "acct:".
    _ID_PATTERN = r"\b(?:(?<=\/@)|(?<=acct:))([a-zA-Z0-9]{10})\b"

    # Minimum substring length used by the Palindrome4 / KnownWords4 strategies.
    _MIN_SUBSTRING_LENGTH = 4

    def __init__(self):
        self.type = "filter"  # this is a filter

        # Load data to use KnownWords4 strategy
        # Download data: https://github.com/dwyl/english-words
        # A set keeps the membership test in has_known_word O(1) per substring.
        self.known_words = set()
        if dictionary_file != "" and os.path.isfile(dictionary_file):
            with open(dictionary_file, "r") as file:
                self.known_words = {
                    word for word in map(str.strip, file) if len(word) > 3
                }
            logger.info("[*] Data loaded to use KnownWords4 strategy")

    def test(self, filtered, data, webserver, port, scheme, method, url):
        """Decide whether a proxied request should be filtered.

        Args:
            filtered: The filter decision made so far; returned unchanged when
                no account ID is found in the body.
            data: Raw request/response body as bytes.
            webserver, port, scheme: Not used by this filter.
            method: HTTP method as bytes (b"GET" is always allowed).
            url: Request URL as bytes (anything containing b"/api" is allowed).

        Returns:
            The updated ``filtered`` flag (True when the strategies judge the
            traffic to be SPAM).
        """
        # prevent cache confusing
        if data.find(b"<title>Welcome to nginx!</title>") > -1:
            return True

        # allowed conditions
        if method == b"GET" or url.find(b"/api") > -1:
            return False

        # convert to text
        data_length = len(data)
        text = data.decode(client_encoding, errors="ignore")
        # An empty body cannot contain undecodable bytes; the guard avoids a
        # ZeroDivisionError for zero-length data.
        error_rate = (data_length - len(text)) / data_length if data_length else 0.0
        if error_rate > 0.2:  # it is a binary data
            return False

        # Every later step needs at least one ID, so stop early without one.
        matches = list(set(re.findall(self._ID_PATTERN, text)))
        if not matches:
            return filtered

        # check ID with K-Anonymity strategy
        logger.info("[*] Found ID: %s" % (", ".join(matches)))
        try:
            filtered = not all(map(self.pwnedpasswords_test, matches))
        except Exception as e:
            logger.error("[*] K-Anonymity strategy not working!", exc_info=e)
            filtered = True

        # feedback: give suspicious IDs a second chance with local heuristics
        if filtered:
            score, strategies = self._score_identifiers(matches)

            # logging score
            with open("score.log", "a") as file:
                file.write(
                    "%s\t%s\t%s\r\n"
                    % ("+".join(matches), str(score), "+".join(strategies))
                )

            # make decision
            if score > 1:
                filtered = False

        # check an attached images (check images with Not-CAPTCHA strategy)
        if truecaptcha_userid != "" and not filtered:
            filtered = self._has_spam_image(text)

        return filtered

    def _score_identifiers(self, matches):
        """Run the heuristic strategies; return ``(score, passed_strategy_names)``."""

        def vowel_ratio_test(s):
            ratio = self.calculate_vowel_ratio(s)
            return 0.2 < ratio < 0.8

        # (name, weight, predicate, enabled) - evaluated in this order; a
        # strategy only scores when *every* ID passes it.
        checks = [
            ("VowelRatio10", 1, vowel_ratio_test, True),
            ("Palindrome4", 1, self.has_palindrome, True),
            ("KnownWords4", 2, self.has_known_word, True),
            ("SearchEngine3", 1, self.search_engine_test, librey_apiurl != ""),
            ("RepeatedNumbers3", 1, self.repeated_numbers_test, True),
        ]

        score = 0
        strategies = []
        for name, weight, predicate, enabled in checks:
            if enabled and all(map(predicate, matches)):
                score += weight
                strategies.append(name)
        return score, strategies

    def _has_spam_image(self, text):
        """Return True when a .webp image linked in ``text`` is solved as SPAM."""
        for url in re.findall(r'https://[^\s"]+\.webp', text):
            logger.info("[*] downloading... %s" % (url))
            encoded_image = self._webp_to_png_base64(url)
            logger.info("[*] downloaded.")
            if not encoded_image:
                continue

            logger.info("[*] solving...")
            try:
                solved = self.truecaptcha_solve(encoded_image)
                if solved:
                    logger.info("[*] solved: %s" % (solved))
                    if solved.lower() in ["ctkpaarr", "spam"]:
                        return True
                else:
                    logger.info("[*] not solved")
            except Exception as e:
                logger.error("[*] Not CAPTCHA strategy not working!", exc_info=e)
        return False

    def _webp_to_png_base64(self, url):
        """Download an image and return it as base64-encoded PNG text, or None on failure."""
        try:
            response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
            img = Image.open(io.BytesIO(response.content))
            img_png = img.convert("RGBA")
            buffered = io.BytesIO()
            img_png.save(buffered, format="PNG")
            return base64.b64encode(buffered.getvalue()).decode(client_encoding)
        except Exception as e:
            # Best effort: an image that cannot be fetched or decoded is
            # skipped.  `Exception` (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            logger.error("[*] Failed to convert image: %s" % (url), exc_info=e)
            return None

    # Strategy: K-Anonymity test - use api.pwnedpasswords.com
    def pwnedpasswords_test(self, s):
        """Return True when the lower-cased ``s`` appears in Pwned Passwords.

        Only the first 5 hex characters of the SHA-1 are sent to the service;
        the response lists the remaining 35-character suffixes as
        ``SUFFIX:COUNT`` lines, which are compared locally.

        Raises:
            Exception: when the API answers with a non-200 status.
        """
        # convert to lowercase
        s = s.lower()

        # SHA1 of the password (the API reports suffixes in upper case)
        p_sha1 = hashlib.sha1(s.encode()).hexdigest().upper()

        # First 5 char of SHA1 for k-anonymity API use
        f5_sha1 = p_sha1[:5]

        # Remaining 35 char of SHA1 to match API output
        suffix = p_sha1[5:]

        # Making GET request using Requests library
        response = requests.get(
            f"https://api.pwnedpasswords.com/range/{f5_sha1}",
            timeout=self._REQUEST_TIMEOUT,
        )

        # Checking if request was successful
        if response.status_code != 200:
            raise Exception(
                "api.pwnedpasswords.com response status: %s"
                % (str(response.status_code))
            )

        # Each line is "SUFFIX:COUNT"; compare the suffix part only.  The
        # previous check (`line.endswith(<last 5 hex chars>)`) looked at the
        # count at the end of the line and therefore practically never matched.
        return any(
            line.split(":")[0].strip().upper() == suffix
            for line in response.text.splitlines()
        )

    # Strategy: Not-CAPTCHA - use truecaptcha.org
    def truecaptcha_solve(self, encoded_image):
        """Send a base64-encoded image to TrueCaptcha and return the solved text.

        Returns:
            The ``result`` field of the response, or None when the service
            reports an error or returns no result.

        Raises:
            Exception: when the API answers with a non-200 status.
        """
        url = "https://api.apitruecaptcha.org/one/gettext"
        data = {
            "userid": truecaptcha_userid,
            "apikey": truecaptcha_apikey,
            "data": encoded_image,
            "mode": "human",
        }
        # NOTE(review): no timeout is set here because the expected response
        # time of "human" mode is not known from this file - confirm and add one.
        response = requests.post(url=url, json=data)

        if response.status_code != 200:
            raise Exception(
                "api.apitruecaptcha.org response status: %s"
                % (str(response.status_code))
            )

        data = response.json()

        if "error_message" in data:
            # Use the module logger (not print) like the rest of the file.
            logger.error("[*] Error: %s" % (data["error_message"]))
            return None
        if "result" in data:
            return data["result"]

        return None

    # Strategy: VowelRatio10
    def calculate_vowel_ratio(self, s):
        """Return (vowel-like characters + vowel-ending patterns) / len(s).

        'w' and 'y' are counted as vowels; each occurrence of a vowel-ending
        pattern ("ang", "ing", "dge", ...) adds one more.  Returns 0.0 for an
        empty string.  Matching is case-insensitive.
        """
        # Calculate the length of the string.
        length = len(s)
        if length == 0:
            return 0.0

        # Lower-case once so both counts below are case-insensitive (the
        # pattern count used to be case-sensitive, unlike the vowel count).
        lowered = s.lower()

        # Count the number of vowels ('a', 'e', 'i', 'o', 'u', 'w', 'y') in the string.
        vowel_count = sum(1 for char in lowered if char in "aeiouwy")

        # Define vowel-ending patterns
        vowel_ending_patterns = ["ang", "eng", "ing", "ong", "ung", "ank", "ink", "dge"]

        # Count the occurrences of vowel-ending patterns in the string.
        vowel_count += sum(lowered.count(pattern) for pattern in vowel_ending_patterns)

        # Calculate the ratio of vowels to the total length of the string.
        return vowel_count / length

    @staticmethod
    def _substrings(s, min_length):
        """Yield every substring of ``s`` that has at least ``min_length`` characters."""
        n = len(s)
        for i in range(n):
            for j in range(i + min_length, n + 1):
                yield s[i:j]

    # Strategy: Palindrome4
    def has_palindrome(self, input_string):
        """Return True when ``input_string`` contains a palindrome of 4+ characters (case-insensitive)."""
        candidates = Fediverse._substrings(
            input_string.lower(), Fediverse._MIN_SUBSTRING_LENGTH
        )
        return any(substring == substring[::-1] for substring in candidates)

    # Strategy: KnownWords4
    def has_known_word(self, input_string):
        """Return True when ``input_string`` contains a known word of 4+ characters (case-insensitive)."""
        known_words = self.known_words
        candidates = Fediverse._substrings(
            input_string.lower(), Fediverse._MIN_SUBSTRING_LENGTH
        )
        return any(substring in known_words for substring in candidates)

    # Strategy: SearchEngine3
    def search_engine_test(self, s):
        """Return True when a LibreY search for ``s`` returns more than 2 entries.

        Any network or JSON failure is logged and treated as "no results" so
        that an unavailable search service cannot crash the filter.
        """
        url = "%s/api.php?q=%s" % (librey_apiurl, s)
        try:
            # NOTE(review): verify=False disables TLS certificate validation,
            # so the answer can be spoofed by a man-in-the-middle.  Kept for
            # compatibility with self-signed LibreY instances - confirm it is
            # still required.
            response = requests.get(url, verify=False, timeout=self._REQUEST_TIMEOUT)
            if response.status_code != 200:
                return False
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            logger.error("[*] SearchEngine3 strategy not working!", exc_info=e)
            return False

        # "results_source" is metadata, not a search result.
        if "results_source" in data:
            del data["results_source"]

        num_results = len(data)

        return num_results > 2

    # Strategy: RepeatedNumbers3
    def repeated_numbers_test(self, s):
        """Return True when ``s`` contains a run of three or more digits."""
        return bool(re.search(r"\d{3,}", s))
|