#!/usr/bin/python3
#
# alwaysonline.py
# Always Online implementation for Caterpillar Proxy
#
# Caterpillar Proxy - The simple web debugging proxy (formerly, php-httpproxy)
# Namyheon Go (Catswords Research) <gnh1201@gmail.com>
# https://github.com/gnh1201/caterpillar
# Created at: 2024-07-31
# Updated at: 2024-07-31
#
import hashlib
import socket
import ssl
from datetime import datetime, UTC

import requests
from decouple import config
from elasticsearch import Elasticsearch, NotFoundError

from base import Extension, Logger

logger = Logger(name="wayback")

try:
    client_encoding = config("CLIENT_ENCODING")
    es_host = config("ES_HOST")
    es_index = config("ES_INDEX")
except Exception as e:
    logger.error("[*] Invalid configuration", exc_info=e)

es = Elasticsearch([es_host])
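
# Example .env entries read by python-decouple above; the values shown here
# are illustrative assumptions, not settings mandated by the project:
#
#   CLIENT_ENCODING=utf-8
#   ES_HOST=http://localhost:9200
#   ES_INDEX=alwaysonline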


def generate_id(url: str):
    """Generate a unique ID for a URL by hashing it."""
    return hashlib.sha256(url.encode("utf-8")).hexdigest()
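
# For example, generate_id("https://example.com/") yields a stable 64-character
# hex digest, so repeated fetches of the same URL map to a single document id
# in the Elasticsearch cache below.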


def get_cached_page_from_google(url: str):
    status_code, content = (0, b"")

    # Google Cache URL
    google_cache_url = "https://webcache.googleusercontent.com/search?q=cache:" + url

    # Send a GET request to the Google Cache URL
    response = requests.get(google_cache_url)

    # Report the upstream status code; extract the content only on success
    status_code = response.status_code
    if status_code == 200:
        content = response.content

    return status_code, content


# API documentation: https://archive.org/help/wayback_api.php
def get_cached_page_from_wayback(url: str):
    status_code, content = (0, b"")

    # Wayback Machine availability API URL
    wayback_api_url = "http://archive.org/wayback/available?url=" + url

    # Send a GET request to the Wayback Machine API
    response = requests.get(wayback_api_url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        try:
            # Parse the JSON response
            data = response.json()
            archived_snapshots = data.get("archived_snapshots", {})
            closest_snapshot = archived_snapshots.get("closest", {})

            # Check if the URL is available in the archive
            if closest_snapshot:
                archived_url = closest_snapshot.get("url", "")

                # If a snapshot URL is available, fetch the archived page
                if archived_url:
                    archived_page_response = requests.get(archived_url)
                    status_code = archived_page_response.status_code
                    if status_code == 200:
                        content = archived_page_response.content
                else:
                    status_code = 404
            else:
                status_code = 404
        except Exception:
            status_code = 502
    else:
        status_code = response.status_code

    return status_code, content
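
# The availability API responds with JSON shaped like the example in the
# documentation linked above (snapshot URL and timestamp vary per query):
#
#   {
#       "archived_snapshots": {
#           "closest": {
#               "status": "200",
#               "available": true,
#               "url": "http://web.archive.org/web/20130919044612/http://example.com/",
#               "timestamp": "20130919044612"
#           }
#       }
#   }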


def get_cached_page_from_elasticsearch(url: str):
    url_id = generate_id(url)
    try:
        result = es.get(index=es_index, id=url_id)
        logger.info(result["_source"])
        return 200, result["_source"]["content"].encode(client_encoding)
    except NotFoundError:
        return 404, b""
    except Exception as e:
        logger.error(f"Error fetching from Elasticsearch: {e}")
        return 502, b""


def cache_to_elasticsearch(url: str, data: bytes):
    url_id = generate_id(url)
    timestamp = datetime.now(UTC).timestamp()
    try:
        es.index(
            index=es_index,
            id=url_id,
            body={
                "url": url,
                "content": data.decode(client_encoding),
                "timestamp": timestamp,
            },
        )
    except Exception as e:
        logger.error(f"Error caching to Elasticsearch: {e}")
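
# Illustrative shape of a cached document as indexed above (field values are
# made up for the example):
#
#   {
#       "url": "https://example.com/",
#       "content": "<html>...</html>",
#       "timestamp": 1722384000.0
#   }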


def get_page_from_origin_server(url: str):
    try:
        response = requests.get(url)
        return response.status_code, response.content
    except Exception as e:
        return 502, str(e).encode(client_encoding)


class AlwaysOnline(Extension):
    def __init__(self):
        self.type = "connector"  # this is a connector
        self.connection_type = "alwaysonline"
        self.buffer_size = 8192

    def connect(self, conn: socket.socket, data: bytes, webserver: bytes, port: bytes, scheme: bytes, method: bytes, url: bytes):
        logger.info("[*] Connecting...")

        connected = False

        is_ssl = scheme in [b"https", b"tls", b"ssl"]
        cache_hit = 0
        buffered = b""

        def sendall(_sock: socket.socket, _conn: socket.socket, _data: bytes):
            # send the first chunk
            _sock.send(_data)
            if len(_data) < self.buffer_size:
                return

            # relay the following chunks from the client
            _conn.settimeout(1)
            while True:
                try:
                    chunk = _conn.recv(self.buffer_size)
                    if not chunk:
                        break
                    _sock.send(chunk)
                except Exception:
                    break

        target_url = url.decode(client_encoding)
        target_scheme = scheme.decode(client_encoding)
        target_webserver = webserver.decode(client_encoding)

        if "://" not in target_url:
            target_url = f"{target_scheme}://{target_webserver}:{port}{target_url}"

        if method == b"GET":
            if not connected:
                logger.info("Trying to get data from Elasticsearch...")
                status_code, content = get_cached_page_from_elasticsearch(target_url)
                if status_code == 200:
                    buffered += content
                    cache_hit += 1
                    connected = True

            if not connected:
                logger.info("Trying to get data from the Wayback Machine...")
                status_code, content = get_cached_page_from_wayback(target_url)
                if status_code == 200:
                    buffered += content
                    cache_hit += 1
                    connected = True

            if not connected:
                logger.info("Trying to get data from the Google Website Cache...")
                status_code, content = get_cached_page_from_google(target_url)
                if status_code == 200:
                    buffered += content
                    cache_hit += 1
                    connected = True

            if cache_hit == 0:
                status_code, content = get_page_from_origin_server(target_url)
                buffered += content
                cache_to_elasticsearch(target_url, buffered)

            conn.send(buffered)
        else:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

            if is_ssl:
                context = ssl.create_default_context()
                context.check_hostname = False
                context.verify_mode = ssl.CERT_NONE

                sock = context.wrap_socket(
                    sock, server_hostname=target_webserver
                )

            sock.connect((webserver, port))
            sendall(sock, conn, data)

        return connected
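

# A minimal usage sketch, assuming a connected client socket `client_conn`;
# in normal operation the Caterpillar server supplies these arguments when it
# dispatches a request to the "alwaysonline" connector:
#
#   ext = AlwaysOnline()
#   ext.connect(client_conn, b"GET / HTTP/1.1\r\n\r\n", b"example.com", 80, b"http", b"GET", b"/")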