#!/usr/bin/python3
#
# alwaysonline.py
# Always Online implementation for Caterpillar Proxy
#
# Caterpillar Proxy - The simple web debugging proxy (formerly, php-httpproxy)
# Namyheon Go (Catswords Research) <gnh1201@gmail.com>
# https://github.com/gnh1201/caterpillar
# Created at: 2024-07-31
# Updated at: 2024-07-31
#
import hashlib
import socket
import ssl
from datetime import datetime, UTC

import requests
from decouple import config
from elasticsearch import Elasticsearch, NotFoundError

from base import Extension, Logger

logger = Logger(name="wayback")

try:
    client_encoding = config("CLIENT_ENCODING")
    es_host = config("ES_HOST")
    es_index = config("ES_INDEX")
except Exception as e:
    logger.error("[*] Invalid configuration", exc_info=e)

es = Elasticsearch([es_host])
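
# Example .env entries read by python-decouple above; the values shown here
# are illustrative assumptions, not settings mandated by the project:
#
#   CLIENT_ENCODING=utf-8
#   ES_HOST=http://localhost:9200
#   ES_INDEX=alwaysonline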


def generate_id(url: str):
    """Generate a unique ID for a URL by hashing it."""
    return hashlib.sha256(url.encode("utf-8")).hexdigest()
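
# For example, generate_id("https://example.com/") yields a stable 64-character
# hex digest, so repeated fetches of the same URL map to a single document id
# in the Elasticsearch cache below.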


def get_cached_page_from_google(url: str):
    status_code, content = (0, b"")

    # Google Cache URL
    google_cache_url = "https://webcache.googleusercontent.com/search?q=cache:" + url

    # Send a GET request to the Google Cache URL
    response = requests.get(google_cache_url)

    # Report the upstream status code; extract the content only on success
    status_code = response.status_code
    if status_code == 200:
        content = response.content

    return status_code, content


# API documentation: https://archive.org/help/wayback_api.php
def get_cached_page_from_wayback(url: str):
    status_code, content = (0, b"")

    # Wayback Machine availability API URL
    wayback_api_url = "http://archive.org/wayback/available?url=" + url

    # Send a GET request to the Wayback Machine API
    response = requests.get(wayback_api_url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        try:
            # Parse the JSON response
            data = response.json()
            archived_snapshots = data.get("archived_snapshots", {})
            closest_snapshot = archived_snapshots.get("closest", {})

            # Check if the URL is available in the archive
            if closest_snapshot:
                archived_url = closest_snapshot.get("url", "")

                # If a snapshot URL is available, fetch the archived page
                if archived_url:
                    archived_page_response = requests.get(archived_url)
                    status_code = archived_page_response.status_code
                    if status_code == 200:
                        content = archived_page_response.content
                else:
                    status_code = 404
            else:
                status_code = 404
        except Exception:
            status_code = 502
    else:
        status_code = response.status_code

    return status_code, content
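
# The availability API responds with JSON shaped like the example in the
# documentation linked above (snapshot URL and timestamp vary per query):
#
#   {
#       "archived_snapshots": {
#           "closest": {
#               "status": "200",
#               "available": true,
#               "url": "http://web.archive.org/web/20130919044612/http://example.com/",
#               "timestamp": "20130919044612"
#           }
#       }
#   }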


def get_cached_page_from_elasticsearch(url: str):
    url_id = generate_id(url)
    try:
        result = es.get(index=es_index, id=url_id)
        logger.info(result["_source"])
        return 200, result["_source"]["content"].encode(client_encoding)
    except NotFoundError:
        return 404, b""
    except Exception as e:
        logger.error(f"Error fetching from Elasticsearch: {e}")
        return 502, b""


def cache_to_elasticsearch(url: str, data: bytes):
    url_id = generate_id(url)
    timestamp = datetime.now(UTC).timestamp()
    try:
        es.index(
            index=es_index,
            id=url_id,
            body={
                "url": url,
                "content": data.decode(client_encoding),
                "timestamp": timestamp,
            },
        )
    except Exception as e:
        logger.error(f"Error caching to Elasticsearch: {e}")
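
# Illustrative shape of a cached document as indexed above (field values are
# made up for the example):
#
#   {
#       "url": "https://example.com/",
#       "content": "<html>...</html>",
#       "timestamp": 1722384000.0
#   }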


def get_page_from_origin_server(url: str):
    try:
        response = requests.get(url)
        return response.status_code, response.content
    except Exception as e:
        return 502, str(e).encode(client_encoding)


class AlwaysOnline(Extension):
    def __init__(self):
        self.type = "connector"  # this is a connector
        self.connection_type = "alwaysonline"
        self.buffer_size = 8192

    def connect(self, conn: socket.socket, data: bytes, webserver: bytes, port: bytes, scheme: bytes, method: bytes, url: bytes):
        logger.info("[*] Connecting...")

        connected = False

        is_ssl = scheme in [b"https", b"tls", b"ssl"]
        cache_hit = 0
        buffered = b""

        def sendall(_sock: socket.socket, _conn: socket.socket, _data: bytes):
            # send the first chunk
            _sock.send(_data)
            if len(_data) < self.buffer_size:
                return

            # relay the following chunks from the client
            _conn.settimeout(1)
            while True:
                try:
                    chunk = _conn.recv(self.buffer_size)
                    if not chunk:
                        break
                    _sock.send(chunk)
                except Exception:
                    break

        target_url = url.decode(client_encoding)
        target_scheme = scheme.decode(client_encoding)
        target_webserver = webserver.decode(client_encoding)

        if "://" not in target_url:
            target_url = f"{target_scheme}://{target_webserver}:{port}{target_url}"

        if method == b"GET":
            if not connected:
                logger.info("Trying to get data from Elasticsearch...")
                status_code, content = get_cached_page_from_elasticsearch(target_url)
                if status_code == 200:
                    buffered += content
                    cache_hit += 1
                    connected = True

            if not connected:
                logger.info("Trying to get data from the Wayback Machine...")
                status_code, content = get_cached_page_from_wayback(target_url)
                if status_code == 200:
                    buffered += content
                    cache_hit += 1
                    connected = True

            if not connected:
                logger.info("Trying to get data from the Google Website Cache...")
                status_code, content = get_cached_page_from_google(target_url)
                if status_code == 200:
                    buffered += content
                    cache_hit += 1
                    connected = True

            if cache_hit == 0:
                status_code, content = get_page_from_origin_server(target_url)
                buffered += content
                cache_to_elasticsearch(target_url, buffered)

            conn.send(buffered)
        else:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

            if is_ssl:
                context = ssl.create_default_context()
                context.check_hostname = False
                context.verify_mode = ssl.CERT_NONE

                sock = context.wrap_socket(
                    sock, server_hostname=target_webserver
                )

            sock.connect((webserver, port))
            sendall(sock, conn, data)

        return connected
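

# A minimal usage sketch, assuming a connected client socket `client_conn`;
# in normal operation the Caterpillar server supplies these arguments when it
# dispatches a request to the "alwaysonline" connector:
#
#   ext = AlwaysOnline()
#   ext.connect(client_conn, b"GET / HTTP/1.1\r\n\r\n", b"example.com", 80, b"http", b"GET", b"/")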