Возможен ли парсинг сайтов посредством скана IP?

RUB1K · 19.02.2025

Подумал: что если массово сканировать IP на открытые порты (80, 443), потом парсить ссылки сайта, затем сканировать их на уязвимости? Получается, мы обходимся без парсинга поисковиков)

Не бейте если что-то не то сказал)

Sexy Kawai · 19.02.2025

RUB1K сказал(а):

Подумал: что если массово сканировать IP на открытые порты (80, 443), потом парсить ссылки сайта, затем сканировать их на уязвимости? Получается, мы обходимся без парсинга поисковиков)

Не бейте если что-то не то сказал)

Писал как-то для себя, скачай пайтон и врубай.
Диапазон в коде меняй под себя и наслаждайся: в IP_RANGE меняй первые два октета (134.199), нули не трогай.

Код:

import requests
from concurrent.futures import ThreadPoolExecutor
import ipaddress
import os
from bs4 import BeautifulSoup
from urllib.parse import urlparse

# Settings
IP_RANGE = "134.199.0.0/16"  # Network (CIDR notation) whose hosts are probed
PORTS = [80, 443]  # HTTP and HTTPS ports
TIMEOUT = 3  # Request timeout, in seconds
MAX_THREADS = 150  # Number of worker threads
RESULTS_FILE = "134.199.txt"  # File the results are saved to
MIN_CONTENT_LENGTH = 100  # Minimum length of the page content
# Substrings that mark a response as a default/placeholder page. Matching is
# a plain, case-sensitive substring test, so every entry needs to appear only
# once (exact duplicates of earlier entries were removed).
EXCLUDED_STRINGS = [
    "Welcome to nginx!",
    "fortinet",
    "mikrotik",
    "Our Website is Coming Soon!",
    "Apache2 Ubuntu Default Page",
    "mailcow UI",
    "Why am I seeing this page?",
    "Apache2 Debian Default Page",
    "Create domains and set up Web hosting using Parallels Plesk.",
    "This page is generated by Plesk,",
    "This Toolbox uses cookies. Find out more in our Cookie Policy.",
    "Apache2 Default Page",
    "Welcome to OpenResty!",
    "Apache Guacamole",
    "The site may have moved to a different server.",
    "Website not found",
    "Web Server's Default Page",
    "CUBEVISIONMODULE",
    "The Community ENTerprise Operating System",
    "Hikvision Digital Technology Co., Ltd. All Rights Reserved.",
    "You've successfully started the Nginx Proxy Manager.",
    "Huawei Technologies Co",
    "This page is generated by Parallels Plesk Panel",
    "没有找到站点",
    "您的请求在Web服务器中没有找到对应的站点！",
    "404 - 找不到文件或目录",
    "Welcome to nginx on AlmaLinux!",
    "This server is a part of a CDN service provided by bunny.net.",
    "Welcome to nginx on Debian!",
    "SSID scan may take up to 60 seconds to complete. During this time, your link will be unavailable.",
    "Welcome to nginx on EPEL!",
    "If you're seeing this, you've successfully installed Tomcat. Congratulations!",
    "CentOS-WebPanel.com",
    "Faithfully yours, nginx.",
    "This page is generated by Plesk, the leading hosting automation software.",
    "If you are the owner of this website, please contact your hosting provider:",
    "This is the default welcome page used to test the correct operation of the Apache2",
    "Welcome to HTTP Server Test Page!",
    "http://go.microsoft.com/fwlink/?linkid=66138&amp;clcid=0x409",
    "Welcome to nginx on Red Hat Enterprise Linux!",
    "powered by CWP | CentOS-WebPanel.com",
    "Sorry, Please confirm that this domain name has been bound to the website.",
    "404 Not Found",
    "502 Bad Gateway",
    "iisstart.png",
    "無効なURLです。",
    "Your web server is working. Now make it work for you.",
    "<!DOCTYPE html><html><head><title>Home Assistant",
    "AlmaLinux Test Page",
]


def save_to_file(url, path=None):
    """Append *url* to the results file unless it is already recorded.

    Args:
        url: The URL to record, written as one line.
        path: File to write to. Defaults to RESULTS_FILE when omitted.

    The file is created on first use. Already-recorded URLs are detected by
    comparing whole lines; a plain substring search over the file contents
    would treat "http://1.2.3.4" as present once "http://1.2.3.45" had been
    written.
    """
    target = RESULTS_FILE if path is None else path

    try:
        with open(target, 'r', encoding='utf-8') as file:
            recorded = {line.strip() for line in file}
    except FileNotFoundError:
        recorded = set()  # First write: nothing has been recorded yet

    if url in recorded:
        return  # URL is already recorded

    # NOTE: the read-then-append sequence is not atomic; callers running in
    # several threads can only rely on it when each URL is written by a
    # single thread.
    with open(target, 'a', encoding='utf-8') as file:
        file.write(url + '\n')
        file.flush()  # Push Python's buffer to the OS
        os.fsync(file.fileno())  # Ask the OS to commit the data to disk


def is_valid_page(content):
    """Decide whether *content* looks like a real page rather than a stub.

    The page is rejected when it is shorter than MIN_CONTENT_LENGTH, when its
    <title> is missing, empty or one of a few known placeholder titles, or
    when it contains any of the EXCLUDED_STRINGS markers.

    Returns:
        True if none of the rejection rules apply, otherwise False.
    """
    # Too short to be a meaningful page.
    if len(content) < MIN_CONTENT_LENGTH:
        return False

    title_tag = BeautifulSoup(content, "html.parser").title
    title_text = ""
    if title_tag and title_tag.string:
        title_text = title_tag.string
    # Placeholder or absent title.
    if title_text.lower().strip() in ("it works!", "hagtos", ""):
        return False

    # Any known default-page marker disqualifies the page.
    for marker in EXCLUDED_STRINGS:
        if marker in content:
            return False

    return True


def is_self_redirect(url, redirect_url):
    """Return True when *redirect_url* leads back to the host of *url*.

    Args:
        url: The URL that was requested.
        redirect_url: The redirect target; may be absolute, scheme-relative
            or relative (e.g. "/login").

    Returns:
        True if the resolved target has the same hostname as *url*.
        False for a different host, an empty target, or a target that
        cannot be parsed.
    """
    # Local import: only urlparse is imported at the top of the file.
    from urllib.parse import urljoin

    if not redirect_url:
        return False  # No target given, nothing to compare

    try:
        # Resolve relative targets against the requested URL first, so that
        # "/path" is attributed to the original host instead of an empty one.
        resolved = urljoin(url, redirect_url)
        # Compare hostnames only: an explicit port or a scheme change
        # (http -> https) still points at the same host.
        return urlparse(url).hostname == urlparse(resolved).hostname
    except ValueError:
        # urlparse rejects some malformed values (e.g. a broken IPv6 literal).
        return False


def scan_ip(ip):
    """Request *ip* once per entry in PORTS and report the first real page.

    Port 80 is requested over http, any other port over https. Redirects are
    not followed: a redirect that is_self_redirect() rejects ends the attempt
    for that port, and only a 200 response whose body passes is_valid_page()
    is saved.

    Returns:
        The URL that was saved, or None if no port produced a valid page.
    """
    for port in PORTS:
        if port == 80:
            scheme = "http"
        else:
            scheme = "https"
        url = f"{scheme}://{ip}"

        try:
            response = requests.get(url, timeout=TIMEOUT, allow_redirects=False, verify=False)
            status = response.status_code

            # Redirect handling: redirects to another host are dropped.
            if 300 <= status < 400:
                redirect_url = response.headers.get('Location', '')
                if not is_self_redirect(url, redirect_url):
                    print(f"[-] Исключён из-за редиректа: {url} -> {redirect_url}")
                    continue
                print(f"[+] Редирект на себя: {url} -> {redirect_url}")

            # Anything other than 200 is reported and the next port is tried.
            if status != 200:
                print(f"[-] Недоступен: {url} (HTTP {status})")
                continue

            if is_valid_page(response.text.strip()):
                print(f"[+] Реальная страница: {url}")
                save_to_file(url)  # Persist the hit immediately
                return url

            print(f"[-] Исключено как ненастоящая страница: {url}")

        except requests.RequestException:
            print(f"[-] Нет доступа: {url}")

    return None


def main():
    """Run scan_ip over every host address of IP_RANGE using a thread pool."""
    print(f"Начало сканирования диапазона {IP_RANGE} на порты {PORTS}")
    targets = ipaddress.ip_network(IP_RANGE).hosts()

    with ThreadPoolExecutor(max_workers=MAX_THREADS) as pool:
        # scan_ip returns None for addresses without a valid page; skip those.
        hits = (found for found in pool.map(scan_ip, targets) if found)
        for found in hits:
            print(f"[Результат сохранён]: {found}")

    print("\nСканирование завершено.")
    print(f"Результаты сохранены в файл: {RESULTS_FILE}")


if __name__ == "__main__":
    main()

RUB1K · 19.02.2025

Sexy Kawai сказал(а):

Писал как-то для себя, скачай пайтон и врубай.
Диапазон в коде меняй под себя и наслаждайся: в IP_RANGE меняй первые два октета (134.199), нули не трогай.

Код:

import requests
from concurrent.futures import ThreadPoolExecutor
import ipaddress
import os
from bs4 import BeautifulSoup
from urllib.parse import urlparse

# Settings
IP_RANGE = "134.199.0.0/16"  # Network (CIDR notation) whose hosts are probed
PORTS = [80, 443]  # HTTP and HTTPS ports
TIMEOUT = 3  # Request timeout, in seconds
MAX_THREADS = 150  # Number of worker threads
RESULTS_FILE = "134.199.txt"  # File the results are saved to
MIN_CONTENT_LENGTH = 100  # Minimum length of the page content
# Substrings that mark a response as a default/placeholder page. Matching is
# a plain, case-sensitive substring test, so every entry needs to appear only
# once (exact duplicates of earlier entries were removed).
EXCLUDED_STRINGS = [
    "Welcome to nginx!",
    "fortinet",
    "mikrotik",
    "Our Website is Coming Soon!",
    "Apache2 Ubuntu Default Page",
    "mailcow UI",
    "Why am I seeing this page?",
    "Apache2 Debian Default Page",
    "Create domains and set up Web hosting using Parallels Plesk.",
    "This page is generated by Plesk,",
    "This Toolbox uses cookies. Find out more in our Cookie Policy.",
    "Apache2 Default Page",
    "Welcome to OpenResty!",
    "Apache Guacamole",
    "The site may have moved to a different server.",
    "Website not found",
    "Web Server's Default Page",
    "CUBEVISIONMODULE",
    "The Community ENTerprise Operating System",
    "Hikvision Digital Technology Co., Ltd. All Rights Reserved.",
    "You've successfully started the Nginx Proxy Manager.",
    "Huawei Technologies Co",
    "This page is generated by Parallels Plesk Panel",
    "没有找到站点",
    "您的请求在Web服务器中没有找到对应的站点！",
    "404 - 找不到文件或目录",
    "Welcome to nginx on AlmaLinux!",
    "This server is a part of a CDN service provided by bunny.net.",
    "Welcome to nginx on Debian!",
    "SSID scan may take up to 60 seconds to complete. During this time, your link will be unavailable.",
    "Welcome to nginx on EPEL!",
    "If you're seeing this, you've successfully installed Tomcat. Congratulations!",
    "CentOS-WebPanel.com",
    "Faithfully yours, nginx.",
    "This page is generated by Plesk, the leading hosting automation software.",
    "If you are the owner of this website, please contact your hosting provider:",
    "This is the default welcome page used to test the correct operation of the Apache2",
    "Welcome to HTTP Server Test Page!",
    "http://go.microsoft.com/fwlink/?linkid=66138&amp;clcid=0x409",
    "Welcome to nginx on Red Hat Enterprise Linux!",
    "powered by CWP | CentOS-WebPanel.com",
    "Sorry, Please confirm that this domain name has been bound to the website.",
    "404 Not Found",
    "502 Bad Gateway",
    "iisstart.png",
    "無効なURLです。",
    "Your web server is working. Now make it work for you.",
    "<!DOCTYPE html><html><head><title>Home Assistant",
    "AlmaLinux Test Page",
]


def save_to_file(url, path=None):
    """Append *url* to the results file unless it is already recorded.

    Args:
        url: The URL to record, written as one line.
        path: File to write to. Defaults to RESULTS_FILE when omitted.

    The file is created on first use. Already-recorded URLs are detected by
    comparing whole lines; a plain substring search over the file contents
    would treat "http://1.2.3.4" as present once "http://1.2.3.45" had been
    written.
    """
    target = RESULTS_FILE if path is None else path

    try:
        with open(target, 'r', encoding='utf-8') as file:
            recorded = {line.strip() for line in file}
    except FileNotFoundError:
        recorded = set()  # First write: nothing has been recorded yet

    if url in recorded:
        return  # URL is already recorded

    # NOTE: the read-then-append sequence is not atomic; callers running in
    # several threads can only rely on it when each URL is written by a
    # single thread.
    with open(target, 'a', encoding='utf-8') as file:
        file.write(url + '\n')
        file.flush()  # Push Python's buffer to the OS
        os.fsync(file.fileno())  # Ask the OS to commit the data to disk


def is_valid_page(content):
    """Decide whether *content* looks like a real page rather than a stub.

    The page is rejected when it is shorter than MIN_CONTENT_LENGTH, when its
    <title> is missing, empty or one of a few known placeholder titles, or
    when it contains any of the EXCLUDED_STRINGS markers.

    Returns:
        True if none of the rejection rules apply, otherwise False.
    """
    # Too short to be a meaningful page.
    if len(content) < MIN_CONTENT_LENGTH:
        return False

    title_tag = BeautifulSoup(content, "html.parser").title
    title_text = ""
    if title_tag and title_tag.string:
        title_text = title_tag.string
    # Placeholder or absent title.
    if title_text.lower().strip() in ("it works!", "hagtos", ""):
        return False

    # Any known default-page marker disqualifies the page.
    for marker in EXCLUDED_STRINGS:
        if marker in content:
            return False

    return True


def is_self_redirect(url, redirect_url):
    """Return True when *redirect_url* leads back to the host of *url*.

    Args:
        url: The URL that was requested.
        redirect_url: The redirect target; may be absolute, scheme-relative
            or relative (e.g. "/login").

    Returns:
        True if the resolved target has the same hostname as *url*.
        False for a different host, an empty target, or a target that
        cannot be parsed.
    """
    # Local import: only urlparse is imported at the top of the file.
    from urllib.parse import urljoin

    if not redirect_url:
        return False  # No target given, nothing to compare

    try:
        # Resolve relative targets against the requested URL first, so that
        # "/path" is attributed to the original host instead of an empty one.
        resolved = urljoin(url, redirect_url)
        # Compare hostnames only: an explicit port or a scheme change
        # (http -> https) still points at the same host.
        return urlparse(url).hostname == urlparse(resolved).hostname
    except ValueError:
        # urlparse rejects some malformed values (e.g. a broken IPv6 literal).
        return False


def scan_ip(ip):
    """Request *ip* once per entry in PORTS and report the first real page.

    Port 80 is requested over http, any other port over https. Redirects are
    not followed: a redirect that is_self_redirect() rejects ends the attempt
    for that port, and only a 200 response whose body passes is_valid_page()
    is saved.

    Returns:
        The URL that was saved, or None if no port produced a valid page.
    """
    for port in PORTS:
        if port == 80:
            scheme = "http"
        else:
            scheme = "https"
        url = f"{scheme}://{ip}"

        try:
            response = requests.get(url, timeout=TIMEOUT, allow_redirects=False, verify=False)
            status = response.status_code

            # Redirect handling: redirects to another host are dropped.
            if 300 <= status < 400:
                redirect_url = response.headers.get('Location', '')
                if not is_self_redirect(url, redirect_url):
                    print(f"[-] Исключён из-за редиректа: {url} -> {redirect_url}")
                    continue
                print(f"[+] Редирект на себя: {url} -> {redirect_url}")

            # Anything other than 200 is reported and the next port is tried.
            if status != 200:
                print(f"[-] Недоступен: {url} (HTTP {status})")
                continue

            if is_valid_page(response.text.strip()):
                print(f"[+] Реальная страница: {url}")
                save_to_file(url)  # Persist the hit immediately
                return url

            print(f"[-] Исключено как ненастоящая страница: {url}")

        except requests.RequestException:
            print(f"[-] Нет доступа: {url}")

    return None


def main():
    """Run scan_ip over every host address of IP_RANGE using a thread pool."""
    print(f"Начало сканирования диапазона {IP_RANGE} на порты {PORTS}")
    targets = ipaddress.ip_network(IP_RANGE).hosts()

    with ThreadPoolExecutor(max_workers=MAX_THREADS) as pool:
        # scan_ip returns None for addresses without a valid page; skip those.
        hits = (found for found in pool.map(scan_ip, targets) if found)
        for found in hits:
            print(f"[Результат сохранён]: {found}")

    print("\nСканирование завершено.")
    print(f"Результаты сохранены в файл: {RESULTS_FILE}")


if __name__ == "__main__":
    main()

Спасибо большое дружище!

Возможен ли парсинг сайтов посредством скана IP?

RUB1K

RAM

Sexy Kawai

RAID-массив

RUB1K

RAM