• XSS.stack #1 – первый литературный журнал от юзеров форума

web scraper for machine learning

spidermoon77

floppy-диск
Пользователь
Регистрация
22.05.2023
Сообщения
6
Реакции
2
I'm making a web scraper for machine learning models, e.g. for NLP. I plan to update the script regularly. It is pretty slow because of multiple requests, etc., so when testing use a low scrape count [0-90].

Python:
import requests
import os
import hashlib,os
import re, signal, sys
import urllib3
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
import codecs

def createFolders(urls, scraped, root_folder):
    """Save each scraped page under root_folder/<i>/<i>.txt.

    `urls` and `scraped` are parallel lists: entry i of `scraped` is the
    content fetched from entry i of `urls` (only the length of `urls` is
    actually used here).
    """
    os.makedirs(root_folder, exist_ok=True)  # Create the root folder

    # enumerate() replaces the range(len(...)) anti-pattern of the original.
    for i, _url in enumerate(urls):
        subfolder_name = str(i)
        subfolder_path = os.path.join(root_folder, subfolder_name)
        os.makedirs(subfolder_path, exist_ok=True)  # Create the subfolder

        file_path = os.path.join(subfolder_path, subfolder_name + ".txt")
        try:
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(str(scraped[i]))
        except UnicodeEncodeError:
            # Best effort: skip a page that cannot be encoded rather than abort.
            print(f"Skipping file {file_path} due to UnicodeEncodeError.")
 
def scrapeContent(url):
    """Fetch *url* and return its parsed HTML (BeautifulSoup), or None on error.

    Changes vs. original: removed the unused `global scraped` declaration,
    added a timeout so one dead host cannot hang the whole run, and collapsed
    the redundant except clauses (InvalidURL and InvalidSchema are subclasses
    of RequestException, so the later handlers were unreachable).
    """
    try:
        response = requests.get(url, timeout=10)
        return bs(response.content, "lxml")
    except requests.exceptions.RequestException as e:
        # Best effort: report and return None so the caller can skip this URL.
        print("Skipping URL because " + str(e))

def getScrapeList(url):
    """Fetch *url* and return every href found in its <a> tags, or None on error.

    Relative hrefs are resolved against *url* with urljoin.
    NOTE(review): verify=False deliberately disables TLS verification for
    scraping arbitrary hosts, but it is insecure — confirm this is acceptable.
    """
    try:
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        # Timeout keeps an unresponsive host from stalling the crawl.
        response = requests.get(url, verify=False, timeout=10)
        content = bs(response.content, "lxml")
        links = []
        for anchor in content.find_all("a"):
            href = anchor.get("href")
            if href is None:
                continue
            # Resolve relative links; absolute ones pass through unchanged
            # (replaces the original's no-op `True` statement).
            if not href.startswith("http"):
                href = urljoin(url, href)
            links.append(href)
        return links
    # One handler suffices: InvalidURL/InvalidSchema subclass RequestException.
    except requests.exceptions.RequestException as e:
        print("Skipping URL because " + str(e))
     
def signal_handler(signum, frame):
    """Handle SIGINT (Ctrl+C): announce the interrupt and exit with status 0.

    The first parameter was renamed from `signal` so it no longer shadows the
    `signal` module.
    """
    print("\nProgram interrupted.")
    # Perform cleanup actions here, if needed
    sys.exit(0)
 
def main():
    """Interactive entry point: crawl links from a start URL, scrape each
    found page, and save the results under a user-chosen root folder.

    Fixes vs. original: the loop guard used the undefined name `maxcurl`
    (NameError on first iteration); a failed start fetch fed None into set()
    (TypeError); "Succesfully" typo in the final message.
    """
    # Set the custom signal handler for Ctrl+C
    signal.signal(signal.SIGINT, signal_handler)

    print("---Webscraper v0---")
    print("------------")
    print("PRESS Ctrl+C to stop scraping")
    print("------------")

    try:
        url = input("Enter URL: ")
        root_folder = input("Enter root folder name: ")
        maxurl = int(input("Enter scrape count: "))

        print("Scanning for URLs to scrape...")
        # Creating initial scrapelist; getScrapeList returns None on failure.
        initial = getScrapeList(url)
        if initial is None:
            print("Could not fetch the start URL; nothing to scrape.")
            return
        scrapelist = list(set(initial))

        # Expand the URL list by fetching links from already-found pages,
        # bounded by the user-supplied budget. Rebinding scrapelist does not
        # affect the list object being iterated.
        for u in scrapelist:
            if maxurl > 0:  # fixed: original read the undefined name `maxcurl`
                maxurl -= 1
                urls = getScrapeList(u)
                if urls is not None:
                    scrapelist = list(set(scrapelist + urls))
                else:
                    break
        print("Total amount of URLs found: " + str(len(scrapelist)))

        print("Scraping found URLs")
        # Get the webpage content (entries are None for failed fetches).
        scraped = [scrapeContent(u) for u in scrapelist]
        print("Done scraping")
        print("Saving data in folders...")
        createFolders(scrapelist, scraped, root_folder)
        print("Successfully scraped content")

    except KeyboardInterrupt:
        # Handle the user interrupt
        print("\nProgram interrupted.")
        sys.exit(0)

 
# Only run the scraper when executed as a script, not on import.
if __name__ == "__main__":
    main()
 
1. Nicely done for 1995, Sergey Brin.
2. Wake up Neo, ты обосрался. В 2к23 все пользуются готовыми датасетами.
3. Скрапить энторнет без каких-либо таймаутов - плохая идея, зависать будет скрипт.
 
These suggestions should help improve the code quality, reliability, and maintainability of the web scraping script. Remember to test the code thoroughly and adapt it to your specific requirements.

Python:
import requests
import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import hashlib
import urllib3
import codecs
import logging

# Module-wide logging: INFO level, one shared logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def create_folders(urls, scraped, root_folder):
    """Persist each scraped page as root_folder/<index>/<index>.txt.

    `urls` and `scraped` are parallel lists; one numbered subfolder is
    created per URL and the matching scraped content is written into it.
    """
    os.makedirs(root_folder, exist_ok=True)  # root directory for all output

    for index, _ in enumerate(urls):
        name = str(index)
        folder = os.path.join(root_folder, name)
        os.makedirs(folder, exist_ok=True)  # numbered subfolder for this page

        target = os.path.join(folder, name + ".txt")
        try:
            with open(target, "w", encoding="utf-8") as handle:
                handle.write(str(scraped[index]))
        except UnicodeEncodeError:
            logger.warning(f"Skipping file {target} due to UnicodeEncodeError.")


def scrape_content(url):
    """Fetch *url* and return its parsed HTML (BeautifulSoup), or None on failure.

    HTTP errors (4xx/5xx) are converted to exceptions via raise_for_status and
    logged; a timeout prevents one unresponsive host from stalling the run.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an exception for unsuccessful requests

        return BeautifulSoup(response.content, "lxml")

    except requests.exceptions.RequestException as e:
        logger.exception(f"Skipping URL {url} due to error: {e}")


def get_scrape_list(url):
    """Fetch *url* and return all hrefs from its <a> tags, or None on failure.

    Relative hrefs are resolved against *url*.
    NOTE(review): verify=False disables TLS verification on purpose here
    (scraping arbitrary hosts) but is insecure — confirm this is acceptable.
    """
    try:
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        # Timeout keeps an unresponsive host from stalling the crawl.
        response = requests.get(url, verify=False, timeout=10)
        response.raise_for_status()

        content = BeautifulSoup(response.content, "lxml")
        links = []
        for anchor in content.find_all("a"):
            href = anchor.get("href")
            if href is None:
                continue
            # Resolve relative links; absolute ones pass through unchanged
            # (replaces the original's no-op `True` statement).
            if not href.startswith("http"):
                href = urljoin(url, href)
            links.append(href)
        return links

    except requests.exceptions.RequestException as e:
        logger.exception(f"Skipping URL {url} due to error: {e}")


def main():
    """Interactive entry point: crawl links from a start URL, scrape each
    page, and save the results under a user-chosen root folder.

    Fixes vs. original: syntax errors `scraped append(content)` and
    `exceptValueError:`; `sys` was used but never imported; the crawl loop
    popped URLs off scrape_list and silently dropped them from the final
    scrape pass.
    """
    import sys  # local import: this script's header never imports sys

    logger.info("---Webscraper v0---")
    logger.info("------------")
    logger.info("PRESS Ctrl+C to stop scraping")
    logger.info("------------")

    try:
        url = input("Enter URL: ").strip()
        root_folder = input("Enter root folder name: ").strip()
        max_urls = int(input("Enter scrape count: "))

        logger.info("Scanning for URLs to scrape...")
        initial = get_scrape_list(url)
        if initial is None:
            logger.error("Could not fetch the start URL; nothing to scrape.")
            sys.exit(1)
        scrape_list = list(set(initial))

        # Frontier expansion bounded by max_urls. Processed URLs move to
        # `visited` so they still get scraped afterwards (the original lost
        # them when popping).
        visited = []
        while max_urls > 0 and scrape_list:
            max_urls -= 1
            current = scrape_list.pop(0)
            visited.append(current)
            new_urls = get_scrape_list(current)
            if new_urls is not None:
                for candidate in new_urls:
                    # Skip already-seen URLs to avoid crawl loops.
                    if candidate not in visited and candidate not in scrape_list:
                        scrape_list.append(candidate)
        scrape_list = visited + scrape_list

        logger.info("Total number of URLs found: " + str(len(scrape_list)))

        logger.info("Scraping found URLs...")
        scraped = []
        for url_to_scrape in scrape_list:
            content = scrape_content(url_to_scrape)
            if content:
                scraped.append(content)  # fixed: was `scraped append(content)`

        logger.info("Done scraping")
        logger.info("Saving data in folders...")
        # NOTE(review): if any fetch failed, scraped is shorter than
        # scrape_list and create_folders will raise IndexError — pre-existing
        # alignment issue inherited from the original; confirm intended fix.
        create_folders(scrape_list, scraped, root_folder)
        logger.info("Successfully scraped content")

    except KeyboardInterrupt:
        logger.info("\nProgram interrupted.")
        sys.exit(0)

    except ValueError:  # fixed: original had the syntax error `exceptValueError:`
        logger.error("Invalid input. Please provide valid inputs.")
        sys.exit(1)

    except Exception as e:
        logger.exception("An error occurred: " + str(e))
        sys.exit(1)


if __name__ == "__main__":
    main()
 
Код:
import requests
import os
import hashlib, os
import re, signal, sys
import urllib3
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
import codecs
    
def createDataFile(scraped, p, root_folder, max_chars_per_file, option):
    """Write scraped data into sequential text files capped at max_chars_per_file.

    option == 0 writes the full page dumps (*scraped*); any other value writes
    the paragraph-only texts (*p*).  Output files are data_0.txt, data_1.txt,
    ... inside *root_folder*; each element is terminated with a newline.

    Fixes vs. original: the oversized-element branch iterated over the padded
    length (text length + 1), which could emit an extra file containing only
    "\n"; chars_written was not reset after chunked output, so the next small
    element could prematurely roll over to a new file.
    """
    file_counter = 0
    chars_written = 0
    os.makedirs(root_folder, exist_ok=True)

    data = scraped if option == 0 else p

    def _path(counter):
        # data_<n>.txt inside the root folder
        return os.path.join(root_folder, "data_" + str(counter) + ".txt")

    for element in data:
        text = str(element)
        element_length = len(text) + 1  # account for the trailing newline

        if element_length > max_chars_per_file:
            # Split oversized elements into max_chars_per_file-sized chunks,
            # one file per chunk; iterate over the real text length.
            for i in range(0, len(text), max_chars_per_file):
                with open(_path(file_counter), "a", encoding="utf-8") as f:
                    f.write(text[i:i + max_chars_per_file] + "\n")
                file_counter += 1
            chars_written = 0  # chunked output always starts a fresh file next
        else:
            # Element fits: append it to the current file.
            with open(_path(file_counter), "a", encoding="utf-8") as f:
                f.write(text + "\n")
            chars_written += element_length

            # If the current file reaches the limit, move to the next file.
            if chars_written >= max_chars_per_file:
                chars_written = 0
                file_counter += 1
                    
def extract_text_from_paragraphs(soup):
    """Return the text of every <p> tag in *soup*, one paragraph per line."""
    lines = []
    for paragraph in soup.find_all("p"):
        lines.append(paragraph.get_text())
    return "\n".join(lines)
    
def scrapeContent(url):
    """Fetch *url* and return [soup, paragraph_text], or None on request errors.

    The returned pair is the fully parsed page plus the joined text of its
    <p> tags.  Changes vs. original: removed the unused `global scraped`
    declaration, added a timeout, and collapsed the redundant except clauses
    (InvalidURL and InvalidSchema are subclasses of RequestException).
    """
    try:
        # Set a custom User-Agent header to mimic a real browser
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }
        # Timeout keeps one dead host from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=10)
        content = bs(response.content, "lxml")
        par = extract_text_from_paragraphs(content)
        return [content, par]

    except requests.exceptions.RequestException as e:
        print("Skipping URL because " + str(e))

def getScrapeList(url):
    """Fetch *url* and return every href found in its <a> tags, or None on error.

    Relative hrefs are resolved against *url* with urljoin.
    NOTE(review): verify=False deliberately disables TLS verification for
    scraping arbitrary hosts, but it is insecure — confirm this is acceptable.
    """
    try:
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        # Timeout keeps an unresponsive host from stalling the crawl.
        response = requests.get(url, verify=False, timeout=10)
        content = bs(response.content, "lxml")
        links = []
        for anchor in content.find_all("a"):
            href = anchor.get("href")
            if href is None:
                continue
            # Resolve relative links; absolute ones pass through unchanged
            # (replaces the original's no-op `True` statement).
            if not href.startswith("http"):
                href = urljoin(url, href)
            links.append(href)
        return links
    # One handler suffices: InvalidURL/InvalidSchema subclass RequestException.
    except requests.exceptions.RequestException as e:
        print("Skipping URL because " + str(e))
        
def signal_handler(signum, frame):
    """Handle SIGINT (Ctrl+C): announce the interrupt and exit with status 0.

    The first parameter was renamed from `signal` so it no longer shadows the
    `signal` module.
    """
    print("\nProgram interrupted.")
    # Perform cleanup actions here, if needed
    sys.exit(0)
    
def main():
    """Interactive entry point: crawl links from a start URL, scrape each page,
    and save the content (full pages or paragraphs only) into capped data files.

    Fixes vs. original: each URL was fetched TWICE (scrapeContent called once
    per list); a failed fetch returned None and crashed on `None[0]`; a failed
    start fetch fed None into set(); "Succesfully" typo in the final message.
    """
    # Set the custom signal handler for Ctrl+C
    signal.signal(signal.SIGINT, signal_handler)

    print("---Webscraper v0---")
    print("-------------------")
    print("PRESS Ctrl+C to stop scraping")
    print("-------------------")

    try:
        url = input("Enter URL: ")
        root_folder = input("Enter root folder name: ")
        maxurl = int(input("Enter scrape count: "))
        max_chars_per_file = int(input("Enter data file char limit: "))
        option = int(input("Enter 0 for all content or 1 for paragraphs only: "))

        print("Scanning for URLs to scrape...")
        # Creating initial scrapelist; getScrapeList returns None on failure.
        initial = getScrapeList(url)
        if initial is None:
            print("Could not fetch the start URL; nothing to scrape.")
            return
        scrapelist = list(set(initial))

        # Expand the URL list from already-found pages, bounded by maxurl.
        for u in scrapelist:
            if maxurl > 0:
                maxurl = maxurl - 1
                urls = getScrapeList(u)
                if urls is not None:
                    scrapelist = list(set(scrapelist + urls))
                else:
                    break
        print("Total amount of URLs found: " + str(len(scrapelist)))
        scrapelist.sort()
        for i in scrapelist:
            print(i)
        print("Scraping found URLs")
        # Get the webpage content: fetch each page ONCE and reuse the result.
        scraped = []
        p = []
        for u in scrapelist:
            result = scrapeContent(u)
            if result is None:
                continue  # request failed; scrapeContent already reported it
            scraped.append(result[0])
            p.append(result[1])
        print("Done scraping")
        print("Saving data in folders...")
        print(p)
        # Create folder and files
        createDataFile(scraped, p, root_folder, max_chars_per_file, option)
        print("Successfully scraped content")

    except KeyboardInterrupt:
        # Handle the user interrupt
        print("\nProgram interrupted.")
        sys.exit(0)

    
# Only run the scraper when executed as a script, not on import.
if __name__ == "__main__":
    main()
Код:

Updated the code with:
- headers to better mimic a browser
- a user-chosen character limit for the data text files
- a choice between saving all content or paragraph text only


STILL HAVE TO INCORPORATE andinov's updates

WORK IN PROGRESS
 
Пожалуйста, обратите внимание, что пользователь заблокирован
Скрапить энторнет без каких-либо таймаутов - плохая идея, зависать будет скрипт.
Это если ему еще дадут что-либо скрапить, ибо с той стороны антифрод не дремлет, а использования качественного антидетекта в коде я не усмотрел (правда и не смотрел особо, так, по диагонали).
 
Here is an example of an RNN text generator made with tensorflow


Код:
import tensorflow as tf

import numpy as np
import os
import time

# Download the Shakespeare corpus (cached locally by Keras after the first run).
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

# Read, then decode for py2 compat.
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')

# Take a look at the first 250 characters in text
print(text[:250])

# The unique characters in the file
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

# Small demo inputs used to exercise the char<->id lookup layers below.
example_texts = ['abcdefg', 'xyz']

chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')

# Forward lookup: characters -> integer ids.
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)

ids = ids_from_chars(chars)

# Inverse lookup: integer ids -> characters.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

chars = chars_from_ids(ids)

tf.strings.reduce_join(chars, axis=-1).numpy()

def text_from_ids(ids):
  # Join the per-character strings back into one string per example.
  return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

# Encode the whole corpus as one long vector of character ids.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))

ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))

# Each example holds seq_length+1 ids: seq_length inputs plus one target.
seq_length = 100

sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

for seq in sequences.take(1):
  print(chars_from_ids(seq))

for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())
 
def split_input_target(sequence):
    """Split *sequence* into (input, target), where target is shifted by one."""
    head = sequence[:-1]
    tail = sequence[1:]
    return head, tail
    
# Quick sanity check of the splitter on a plain string.
split_input_target(list("Tensorflow"))

dataset = sequences.map(split_input_target)

for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Shuffle, batch, and prefetch so the input pipeline overlaps with training.
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

# Length of the vocabulary in StringLookup Layer
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

class MyModel(tf.keras.Model):
    """Character-level GRU language model: embedding -> GRU -> vocab logits."""

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super().__init__(self)
        # Token embedding, one recurrent layer, and a dense projection back
        # onto the vocabulary.
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(rnn_units,
                                       return_sequences=True,
                                       return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        embedded = self.embedding(inputs, training=training)
        # Start from a fresh zero state unless the caller threads one through.
        if states is None:
            states = self.gru.get_initial_state(embedded)
        outputs, states = self.gru(embedded, initial_state=states, training=training)
        logits = self.dense(outputs, training=training)

        if return_state:
            return logits, states
        return logits
      
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

# Run one batch through the untrained model to verify output shapes.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

model.summary()

# Sample from the untrained logits to show what random predictions look like.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

# from_logits=True because the model outputs raw logits, not probabilities.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

tf.exp(example_batch_mean_loss).numpy()

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save weights (not the full model) after every epoch.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

# Load the model from the checkpoint if it exists
if os.path.exists(checkpoint_dir):
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint is not None:
        print(f"Loading weights from checkpoint: {latest_checkpoint}")
        model.load_weights(latest_checkpoint)

model.compile(optimizer='adam', loss=loss)

EPOCHS = 1

history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

class OneStep(tf.keras.Model):
  """Sampler that generates one character per call, threading the GRU state.

  Wraps the trained model together with the char<->id lookup layers;
  `temperature` rescales the logits before sampling (higher = more random).
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states
    
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

# Generate 1000 characters from a single seed, one step at a time.
start = time.time()
states = None
next_char = tf.constant(['ROMEO:'])
result = [next_char]

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

# Same generation loop, but batched: five sequences generated in parallel.
start = time.time()
states = None
next_char = tf.constant(['ROMEO:', 'ROMEO:', 'ROMEO:', 'ROMEO:', 'ROMEO:'])
result = [next_char]

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result, '\n\n' + '_'*80)
print('\nRun time:', end - start)

# Export the generator as a SavedModel and verify it still works after reload.
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')

states = None
next_char = tf.constant(['ROMEO:'])
result = [next_char]

for n in range(100):
  next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
  result.append(next_char)

print(tf.strings.join(result)[0].numpy().decode("utf-8"))
 
The above example is a basic recurrent network. The code trains the model and saves checkpoints. It uses a Shakespeare dataset for training.
This is straight from the TensorFlow site, but I'm working on upgrading the script.

It also preprocesses the data file.

feel free to test it with files scraped with my webscraper.

Working on threading — it's still very slow at the moment. And some sites use Cloudflare, etc.
 
а использования качественного антидетекта в коде я не усмотрел
Кстати есть какие то более менее качественные наброски кода puppeteer и antidetect чтобы типa там, загрузка cookies, proxy и так далее.
Может кто уже делал. А то лень велосипед изобретать
 
Пожалуйста, обратите внимание, что пользователь заблокирован
Кстати есть какие то более менее качественные наброски кода puppeteer и antidetect чтобы типa там, загрузка cookies, proxy и так далее.
Может кто уже делал. А то лень велосипед изобретать
CDP это более верхний уровень, реальный антидетект закопан глубоко внутри.
 
Пожалуйста, обратите внимание, что пользователь заблокирован
Даже не знал что такое есть, спасибо, буду изучать!
Пиши в жабу, помогу чем смогу.
 


Напишите ответ...
  • Вставить:
Прикрепить файлы
Верх