Update server.py, plugins/fediverse.py

2025-06-18 02:59:07 +00:00 · 2024-02-28 15:11:49 +09:00 · 2024-02-28 15:11:49 +09:00 · 7156b934cb
commit 7156b934cb
parent 18f928dbe8
2 changed files with 310 additions and 303 deletions
--- a/plugins/fediverse.py
+++ b/plugins/fediverse.py
@ -0,0 +1,269 @@
 #!/usr/bin/python3
 # 
 # fediverse.py
 # Fediverse (Mastodon, Misskey, Pleroma, ...) SPAM filter plugin for Caterpillar
 # 
 # Caterpillar - The simple and parasitic web proxy with spam filter
 # Namyheon Go (Catswords Research) <gnh1201@gmail.com>
 # https://github.com/gnh1201/caterpillar
 # Created at: 2022-10-06
 # Updated at: 2024-12-28
 # 
 import base64
 import hashlib
 import io
 import re
 import requests
 from decouple import config
 from PIL import Image
 from server import Filter
 # Optional plugin settings read through python-decouple.
 # A missing key falls back to '' so the strategy that depends on it is
 # skipped by the `!= ''` checks instead of failing on an undefined name.
 truecaptcha_userid = config('TRUECAPTCHA_USERID', default='')   # truecaptcha.org
 truecaptcha_apikey = config('TRUECAPTCHA_APIKEY', default='')   # truecaptcha.org
 librey_apiurl = config("LIBREY_APIURL", default='')    # https://github.com/Ahwxorg/librey
 class Fediverse(Filter):
    def __init__(self):
        # Load data to use KnownWords4 strategy
        # Download data: https://github.com/dwyl/english-words
        self.known_words = []
        with open("words_alpha.txt", "r") as file:
            words = file.readlines()
            self.known_words = [word.strip() for word in words if len(word.strip()) > 3]
            print ("[*] Data loaded to use KnownWords4 strategy")
    def test(self, data):
        filtered = False
        # prevent cache confusing
        if data.find(b'<title>Welcome to nginx!</title>') > -1:
            return True
        # allowed conditions
        if method == b'GET' or url.find(b'/api') > -1:
            return False
        # convert to text
        data_length = len(data)
        text = data.decode(client_encoding, errors='ignore')
        error_rate = (data_length - len(text)) / data_length
        if error_rate > 0.2:    # it is a binary data
            return False
        # check ID with K-Anonymity strategy
        pattern = r'\b(?:(?<=\/@)|(?<=acct:))([a-zA-Z0-9]{10})\b'
        matches = list(set(re.findall(pattern, text)))
        if len(matches) > 0:
            print ("[*] Found ID: %s" % (', '.join(matches)))
            try:
                filtered = not all(map(self.pwnedpasswords_test, matches))
            except Exception as e:
                print ("[*] K-Anonymity strategy not working! %s" % (str(e)))
                filtered = True
        # feedback
        if filtered and len(matches) > 0:
            score = 0
            strategies = []
            # check ID with VowelRatio10 strategy
            def vowel_ratio_test(s):
                ratio = self.calculate_vowel_ratio(s)
                return ratio > 0.2 and ratio < 0.8
            if all(map(vowel_ratio_test, matches)):
                score += 1
                strategies.append('VowelRatio10')
            # check ID with Palindrome4 strategy
            if all(map(self.has_palindrome, matches)):
                score += 1
                strategies.append('Palindrome4')
            # check ID with KnownWords4 strategy
            if all(map(self.has_known_word, matches)):
                score += 2
                strategies.append('KnownWords4')
            # check ID with SearchEngine3 strategy
            if librey_apiurl != '' and all(map(self.search_engine_test, matches)):
                score += 1
                strategies.append('SearchEngine3')
            # check ID with RepeatedNumbers3 strategy
            if all(map(self.repeated_numbers_test, matches)):
                score += 1
                strategies.append('RepeatedNumbers3')
            # logging score
            with open('score.log', 'a') as file:
                file.write("%s\t%s\t%s\r\n" % ('+'.join(matches), str(score), '+'.join(strategies)))
            # make decision
            if score > 1:
                filtered = False
        # check an attached images (check images with Not-CAPTCHA strategy)
        if truecaptcha_userid != '' and not filtered and len(matches) > 0:
            def webp_to_png_base64(url):
                try:
                    response = requests.get(url)
                    img = Image.open(io.BytesIO(response.content))
                    img_png = img.convert("RGBA")
                    buffered = io.BytesIO()
                    img_png.save(buffered, format="PNG")
                    encoded_image = base64.b64encode(buffered.getvalue()).decode(client_encoding)
                    return encoded_image
                except:
                    return None
            urls = re.findall(r'https://[^\s"]+\.webp', text)
            if len(urls) > 0:
                for url in urls:
                    if filtered:
                        break
                    print ("[*] downloading... %s" % (url))
                    encoded_image = webp_to_png_base64(url)
                    print ("[*] downloaded.")
                    if encoded_image:
                        print ("[*] solving...")
                        try:
                            solved = truecaptcha_solve(encoded_image)
                            if solved:
                                print ("[*] solved: %s" % (solved))
                                filtered = filtered or (solved.lower() in ['ctkpaarr', 'spam'])
                            else:
                                print ("[*] not solved")
                        except Exception as e:
                            print ("[*] Not CAPTCHA strategy not working! %s" % (str(e)))
        return filtered
    # Strategy: K-Anonymity test - use api.pwnedpasswords.com
    def pwnedpasswords_test(self, s):
        """Check whether ``s`` is listed by the Pwned Passwords range API.

        Only the first five hex digits of the SHA-1 digest are sent
        (k-anonymity); the suffixes in the response are compared locally.

        Args:
            s: Candidate string; it is lowercased before hashing.

        Returns:
            True when the digest of ``s`` appears in the response.

        Raises:
            Exception: when the API answers with a status other than 200.
        """
        # The API lists suffixes in upper case, so compare in upper case.
        digest = hashlib.sha1(s.lower().encode()).hexdigest().upper()
        prefix, suffix = digest[:5], digest[5:]
        response = requests.get(f'https://api.pwnedpasswords.com/range/{prefix}')
        if response.status_code != 200:
            raise Exception("api.pwnedpasswords.com response status: %s" % (str(response.status_code)))
        # Each response line is "<remaining 35 hex digits>:<count>"; match on the
        # part before the colon rather than on the end of the line.
        return any(
            line.split(':')[0].strip().upper() == suffix
            for line in response.text.splitlines()
        )
    # Strategy: Not-CAPTCHA - use truecaptcha.org
    def truecaptcha_solve(self, encoded_image):
        """Submit a base64-encoded image to TrueCaptcha and return its text.

        Args:
            encoded_image: Image content sent as the ``data`` field.

        Returns:
            The ``result`` field of the JSON reply, or None when the reply
            carries an ``error_message`` or has no ``result``.

        Raises:
            Exception: when the service answers with a status other than 200.
        """
        payload = {
            'userid': truecaptcha_userid,
            'apikey': truecaptcha_apikey,
            'data': encoded_image,
            'mode': 'human'
        }
        reply = requests.post(url = 'https://api.apitruecaptcha.org/one/gettext', json = payload)
        # fail first, then handle the successful reply
        if reply.status_code != 200:
            raise Exception("api.apitruecaptcha.org response status: %s" % (str(reply.status_code)))
        body = reply.json()
        if 'error_message' in body:
            print ("[*] Error: %s" % (body['error_message']))
            return None
        return body['result'] if 'result' in body else None
    # Strategy: VowelRatio10
    def calculate_vowel_ratio(self, s):
        """Return the ratio of vowel-like units to the length of ``s``.

        Letters 'a', 'e', 'i', 'o', 'u', 'w', 'y' count as vowels, and each
        occurrence of a vowel-ending pattern ('ang', 'ing', 'dge', ...) adds
        one more. Both counts ignore letter case.

        Args:
            s: String to analyse.

        Returns:
            The ratio as a float; 0.0 for an empty string. The value can
            exceed the plain vowel share because patterns are counted on top.
        """
        length = len(s)
        if length == 0:
            return 0.0
        # Count the number of vowels ('a', 'e', 'i', 'o', 'u', 'w', 'y') in the string.
        vowel_count = sum(1 for char in s if char.lower() in 'aeiouwy')
        # Define vowel-ending patterns
        vowel_ending_patterns = ['ang', 'eng', 'ing', 'ong', 'ung', 'ank', 'ink', 'dge']
        # Match the patterns on a lowercased copy so that 'ING' counts like 'ing',
        # in line with the case-insensitive vowel count above.
        lowered = s.lower()
        vowel_count += sum(lowered.count(pattern) for pattern in vowel_ending_patterns)
        return vowel_count / length
    # Strategy: Palindrome4
    def has_palindrome(self, input_string):
        """Report whether ``input_string`` contains a palindromic substring.

        The check ignores letter case and only looks at substrings of four
        or more characters.
        """
        lowered = input_string.lower()
        size = len(lowered)
        # every (start, stop) pair that spans at least 4 characters
        spans = (
            (start, stop)
            for start in range(size)
            for stop in range(start + 4, size + 1)
        )
        return any(
            lowered[start:stop] == lowered[start:stop][::-1]
            for start, stop in spans
        )
    # Strategy: KnownWords4
    def has_known_word(self, input_string):
        def is_known_word(s):
            return s in self.known_words
        input_string = input_string.lower()
        n = len(input_string)
        for i in range(n):
            for j in range(i + 4, n + 1):  # Find substrings of at least 5 characters
                substring = input_string[i:j]
                if is_known_word(substring):
                    return True
        return False
    # Strategy: SearchEngine3
    def search_engine_test(self, s):
        """Query the configured LibreY instance for ``s``.

        Returns:
            True when the JSON reply holds more than two entries after the
            ``results_source`` key is dropped; False on a non-200 status.
        """
        # NOTE(review): verify=False turns off TLS certificate verification -
        # confirm this is intended for the configured instance.
        reply = requests.get("%s/api.php?q=%s" % (librey_apiurl, s), verify=False)
        if reply.status_code != 200:
            return False
        results = reply.json()
        # 'results_source' is not counted as a result
        if 'results_source' in results:
            del results['results_source']
        return len(results) > 2
    # Strategy: RepeatedNumbers3
    def repeated_numbers_test(self, s):
        """Report whether ``s`` contains a run of three or more digits."""
        digit_run = re.search(r'\d{3,}', s)
        return digit_run is not None
--- a/server.py
+++ b/server.py
@ -1,8 +1,13 @@
 #!/usr/bin/python3
 # 
 # server.py
 # 
 # Caterpillar - The simple and parasitic web proxy with spam filter
 # Namyheon Go (Catswords Research) <gnh1201@gmail.com>
 # https://github.com/gnh1201/caterpillar
 # Created at: 2022-10-06
 # Updated at: 2024-12-28
 # 
 import argparse
 import socket
@ -13,16 +18,11 @@ import base64
 import json
 import ssl
 import time
 import re
 import hashlib
 import resource
 import traceback
 import io
 import textwrap
 from subprocess import Popen, PIPE
 from datetime import datetime
 from platform import python_version
 from PIL import Image
 import requests
 from decouple import config
@ -39,11 +39,6 @@ try:
    client_encoding = config('CLIENT_ENCODING')
    local_domain = config('LOCAL_DOMAIN')
    proxy_pass = config('PROXY_PASS')
    mastodon_server = config('MASTODON_SERVER')   # catswords.social
    mastodon_user_token = config('MASTODON_USER_TOKEN')   # catswords.social
    truecaptcha_userid = config('TRUECAPTCHA_USERID')   # truecaptcha.org
    truecaptcha_apikey = config('TRUECAPTCHA_APIKEY')   # truecaptcha.org
    librey_apiurl = config("LIBREY_APIURL")    # https://github.com/Ahwxorg/librey
 except KeyboardInterrupt:
    print("\n[*] User has requested an interrupt")
    print("[*] Application Exiting.....")
@ -60,41 +55,6 @@ buffer_size = args.buffer_size
 # module-level state; both containers start empty
 accepted_relay = dict()
 resolved_address_list = list()
 # raise the core-file size limit (soft and hard) to unlimited
 # https://stackoverflow.com/questions/25475906/set-ulimit-c-from-outside-shell
 resource.setrlimit(resource.RLIMIT_CORE, (resource.RLIM_INFINITY, resource.RLIM_INFINITY))
 # word list for the KnownWords4 strategy (entries longer than 3 characters)
 # Download data: https://github.com/dwyl/english-words
 known_words = []
 if os.path.exists("words_alpha.txt"):
    with open("words_alpha.txt", "r") as file:
        words = file.readlines()
        known_words = [entry for entry in map(str.strip, words) if len(entry) > 3]
        print ("[*] data loaded to use KnownWords4 strategy")
 def start():    #Main Program
    """Open the listening socket and dispatch every accepted connection.

    Binds to ``listening_port`` on all interfaces. For each accepted
    connection the first ``buffer_size`` bytes are read and handed to
    ``conn_string`` on a new thread. Exits with status 2 when the socket
    cannot be set up and with status 1 on a keyboard interrupt.
    """
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.bind(('', listening_port))
        sock.listen(max_connection)
        print("[*] Server started successfully [ %d ]" %(listening_port))
    except Exception as e:
        print("[*] Unable to Initialize Socket")
        # print the caught error itself, not the Exception class
        print(e)
        sys.exit(2)
    while True:
        try:
            conn, addr = sock.accept() #Accept connection from client browser
            data = conn.recv(buffer_size) #Receive client data
            start_new_thread(conn_string, (conn, data, addr)) #Starting a thread
        except KeyboardInterrupt:
            sock.close()
            print("\n[*] Graceful Shutdown")
            sys.exit(1)
 def jsonrpc2_create_id(data):
    """Return the SHA-1 hex digest of the JSON serialization of ``data``."""
    serialized = json.dumps(data)
    digest = hashlib.sha1(serialized.encode(client_encoding))
    return digest.hexdigest()
@ -214,123 +174,9 @@ def proxy_connect(webserver, conn):
 def proxy_check_filtered(data, webserver, port, scheme, method, url):
    filtered = False
-    # prevent cache confusing
+    for f in Filter.get_filters():
-    if data.find(b'<title>Welcome to nginx!</title>') > -1:
+        if not filtered:
-        return True
+            filtered = f.test(data)
    # allowed conditions
    if method == b'GET' or url.find(b'/api') > -1:
        return False
    # convert to text
    data_length = len(data)
    text = data.decode(client_encoding, errors='ignore')
    error_rate = (data_length - len(text)) / data_length
    if error_rate > 0.2:    # it is a binary data
        return False
    # check ID with K-Anonymity strategy
    pattern = r'\b(?:(?<=\/@)|(?<=acct:))([a-zA-Z0-9]{10})\b'
    matches = list(set(re.findall(pattern, text)))
    if len(matches) > 0:
        print ("[*] Found ID: %s" % (', '.join(matches)))
        try:
            filtered = not all(map(pwnedpasswords_test, matches))
        except Exception as e:
            print ("[*] K-Anonymity strategy not working! %s" % (str(e)))
            filtered = True
    # feedback
    if filtered and len(matches) > 0:
        score = 0
        strategies = []
        # check ID with VowelRatio10 strategy
        def vowel_ratio_test(s):
            ratio = calculate_vowel_ratio(s)
            return ratio > 0.2 and ratio < 0.8
        if all(map(vowel_ratio_test, matches)):
            score += 1
            strategies.append('VowelRatio10')
        # check ID with Palindrome4 strategy
        if all(map(has_palindrome, matches)):
            score += 1
            strategies.append('Palindrome4')
        # check ID with KnownWords4 strategy
        if all(map(has_known_word, matches)):
            score += 2
            strategies.append('KnownWords4')
        # check ID with SearchEngine3 strategy
        if librey_apiurl != '' and all(map(search_engine_test, matches)):
            score += 1
            strategies.append('SearchEngine3')
        # check ID with RepeatedNumbers3 strategy
        if all(map(repeated_numbers_test, matches)):
            score += 1
            strategies.append('RepeatedNumbers3')
        # logging score
        with open('score.log', 'a') as file:
            file.write("%s\t%s\t%s\r\n" % ('+'.join(matches), str(score), '+'.join(strategies)))
        # make decision
        if score > 1:
            filtered = False
    # check an attached images (check images with Not-CAPTCHA strategy)
    if truecaptcha_userid != '' and not filtered and len(matches) > 0:
        def webp_to_png_base64(url):
            try:
                response = requests.get(url)
                img = Image.open(io.BytesIO(response.content))
                img_png = img.convert("RGBA")
                buffered = io.BytesIO()
                img_png.save(buffered, format="PNG")
                encoded_image = base64.b64encode(buffered.getvalue()).decode(client_encoding)
                return encoded_image
            except:
                return None
        urls = re.findall(r'https://[^\s"]+\.webp', text)
        if len(urls) > 0:
            for url in urls:
                if filtered:
                    break
                print ("[*] downloading... %s" % (url))
                encoded_image = webp_to_png_base64(url)
                print ("[*] downloaded.")
                if encoded_image:
                    print ("[*] solving...")
                    try:
                        solved = truecaptcha_solve(encoded_image)
                        if solved:
                            print ("[*] solved: %s" % (solved))
                            filtered = filtered or (solved.lower() in ['ctkpaarr', 'spam'])
                        else:
                            print ("[*] not solved")
                    except Exception as e:
                        print ("[*] Not CAPTCHA strategy not working! %s" % (str(e)))
    # take action
    if filtered:
        print ("[*] Filtered from %s:%s" % (webserver.decode(client_encoding), str(port)))
        try:
            savedir = './savedfiles'
            if not os.path.exists(savedir):
                os.makedirs(savedir)
            current_time = datetime.now().strftime("%Y%m%d%H%M%S")
            file_path = os.path.join(savedir, ("%s_%s.bin" % (current_time, webserver.decode(client_encoding))))
            with open(file_path, 'wb') as file:
                file.write(data)
            print ("[*] Saved the file: %s" % (file_path))
        except Exception as e:
            print ("[*] Failed to save the file: %s" % (str(e)))
    return filtered
@ -599,153 +445,45 @@ def add_filtered_host(domain, ip_address):
        lines.append(f"{ip_address}\t{domain}\n")
        with open(hosts_path, 'w') as file:
            file.writelines(lines)
        if mastodon_user_token != '':    # notify to catswords.social
            post_status_to_mastodon(f"[{mastodon_server} user]\r\n\r\n{domain} is a domain with suspicious spam activity.\r\n\r\n#catswords")
-# notify to mastodon server
+def start():    #Main Program
-def post_status_to_mastodon(text, media_ids=None, poll_options=None, poll_expires_in=None, scheduled_at=None, idempotency_key=None):
+    try:
-    url = f"https://{mastodon_server}/api/v1/statuses"
+        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    headers = {
+        sock.bind(('', listening_port))
-        "Authorization": f"Bearer {user_token}",
+        sock.listen(max_connection)
-        "Content-Type": "application/x-www-form-urlencoded",
+        print("[*] Server started successfully [ %d ]" %(listening_port))
-    }
+    except Exception:
-    form_data = {
+        print("[*] Unable to Initialize Socket")
-        "status": text,
+        print(Exception)
-        "media_ids[]": media_ids,
+        sys.exit(2)
        "poll[options][]": poll_options,
        "poll[expires_in]": poll_expires_in,
        "scheduled_at": scheduled_at,
    }
    if idempotency_key:
        headers["Idempotency-Key"] = idempotency_key
-    response = requests.post(url, headers=headers, data=form_data)
+    while True:
-    return response.json()
+        try:
            conn, addr = sock.accept() #Accept connection from client browser
            data = conn.recv(buffer_size) #Recieve client data
            start_new_thread(conn_string, (conn, data, addr)) #Starting a thread
        except KeyboardInterrupt:
            sock.close()
            print("\n[*] Graceful Shutdown")
            sys.exit(1)
-# Strategy: K-Anonymity test - use api.pwnedpasswords.com
+class Filter():
-def pwnedpasswords_test(s):
+    filters = []
    # convert to lowercase
    s = s.lower()
-    # SHA1 of the password
+    @classmethod
-    p_sha1 = hashlib.sha1(s.encode()).hexdigest()
+    def register(cls, f):
        cls.filters.append(f)
-    # First 5 char of SHA1 for k-anonymity API use
+    @classmethod
-    f5_sha1 = p_sha1[:5]
+    def get_filters(cls):
        return cls.filters
-    # Last 5 char of SHA1 to match API output
+    def test(self, data):
-    l5_sha1 = p_sha1[-5:]
+        print ("[*] Not implemented")
    # Making GET request using Requests library
    response = requests.get(f'https://api.pwnedpasswords.com/range/{f5_sha1}')
    # Checking if request was successful
    if response.status_code == 200:
        # Parsing response text
        hashes = response.text.split('\r\n')
        # Using list comprehension to find matching hashes
        matching_hashes = [line.split(':')[0] for line in hashes if line.endswith(l5_sha1)]
        # If there are matching hashes, return True, else return False
        return bool(matching_hashes)
    else:
        raise Exception("api.pwnedpasswords.com response status: %s" % (str(response.status_code)))
    return False
 # Strategy: Not-CAPTCHA - use truecaptcha.org
 def truecaptcha_solve(encoded_image):
    """Submit a base64-encoded image to TrueCaptcha and return its text.

    Returns the ``result`` field of the JSON reply, or None when the reply
    carries an ``error_message`` or has no ``result``. Raises Exception when
    the service answers with a status other than 200.
    """
    payload = {
        'userid': truecaptcha_userid,
        'apikey': truecaptcha_apikey,
        'data': encoded_image,
        'mode': 'human'
    }
    reply = requests.post(url = 'https://api.apitruecaptcha.org/one/gettext', json = payload)
    # fail first, then handle the successful reply
    if reply.status_code != 200:
        raise Exception("api.apitruecaptcha.org response status: %s" % (str(reply.status_code)))
    body = reply.json()
    if 'error_message' in body:
        print ("[*] Error: %s" % (body['error_message']))
        return None
    return body['result'] if 'result' in body else None
 # Strategy: VowelRatio10
 def calculate_vowel_ratio(s):
    """Return the ratio of vowel-like units to the length of ``s``.

    Letters 'a', 'e', 'i', 'o', 'u', 'w', 'y' count as vowels, and each
    occurrence of a vowel-ending pattern ('ang', 'ing', 'dge', ...) adds one
    more. Both counts ignore letter case. An empty string yields 0.0.
    """
    length = len(s)
    if length == 0:
        return 0.0
    # Count the number of vowels ('a', 'e', 'i', 'o', 'u', 'w', 'y') in the string.
    vowel_count = sum(1 for char in s if char.lower() in 'aeiouwy')
    # Define vowel-ending patterns
    vowel_ending_patterns = ['ang', 'eng', 'ing', 'ong', 'ung', 'ank', 'ink', 'dge']
    # Match the patterns on a lowercased copy so that 'ING' counts like 'ing',
    # in line with the case-insensitive vowel count above.
    lowered = s.lower()
    vowel_count += sum(lowered.count(pattern) for pattern in vowel_ending_patterns)
    return vowel_count / length
 # Strategy: Palindrome4
 def has_palindrome(input_string):
    """Report whether ``input_string`` holds a palindrome of 4+ characters.

    The comparison ignores letter case.
    """
    lowered = input_string.lower()
    size = len(lowered)
    for start in range(size):
        # candidate substrings beginning at `start`, at least 4 characters long
        for stop in range(start + 4, size + 1):
            piece = lowered[start:stop]
            if piece == piece[::-1]:
                return True
    return False
 # Strategy: KnownWords4
 def has_known_word(input_string):
    """Report whether ``input_string`` contains an entry of ``known_words``.

    The check ignores letter case and only looks at substrings of four or
    more characters.
    """
    lowered = input_string.lower()
    size = len(lowered)
    return any(
        lowered[start:stop] in known_words
        for start in range(size)
        for stop in range(start + 4, size + 1)
    )
 # Strategy: SearchEngine3
 def search_engine_test(s):
    """Query the configured LibreY instance for ``s``.

    Returns True when the JSON reply holds more than two entries after the
    ``results_source`` key is dropped, and False on a non-200 status.
    """
    # NOTE(review): verify=False turns off TLS certificate verification -
    # confirm this is intended for the configured instance.
    reply = requests.get("%s/api.php?q=%s" % (librey_apiurl, s), verify=False)
    if reply.status_code != 200:
        return False
    results = reply.json()
    # 'results_source' is not counted as a result
    if 'results_source' in results:
        del results['results_source']
    return len(results) > 2
 # Strategy: RepeatedNumbers3
 def repeated_numbers_test(s):
    """Report whether ``s`` contains a run of three or more digits."""
    digit_run = re.search(r'\d{3,}', s)
    return digit_run is not None
 if __name__ == "__main__":
    # register the Fediverse plugin as a filter, then run the server loop
    fediverse_plugin = importlib.import_module("plugins.fediverse")
    Filter.register(fediverse_plugin.Fediverse())
    start()