I'm making a web scraper to gather training data for machine learning models (e.g. NLP). I plan to update the script regularly. It is pretty slow because it makes multiple requests, so when testing, use a low scrape count (0-90).
Python:
import requests
import os
import hashlib,os
import re, signal, sys
import urllib3
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
import codecs
def createFolders(urls, scraped, root_folder):
    """Save each scraped page's content to <root_folder>/<i>/<i>.txt.

    Args:
        urls: list of scraped URLs; kept for interface compatibility and
            used only to cap how many entries are written.
        scraped: list parallel to urls; each entry is the page content
            (or None when the scrape failed — those entries are skipped).
        root_folder: name of the directory to create the subfolders in.
    """
    os.makedirs(root_folder, exist_ok=True)  # Create the root folder
    # Iterate scraped directly instead of indexing via range(len(urls)):
    # the original raised IndexError whenever len(scraped) < len(urls).
    for i, content in enumerate(scraped[:len(urls)]):
        if content is None:
            # Failed scrapes would otherwise be written as the string "None".
            continue
        subfolder_path = os.path.join(root_folder, str(i))
        os.makedirs(subfolder_path, exist_ok=True)  # Create the subfolder
        file_path = os.path.join(subfolder_path, str(i) + ".txt")
        try:
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(str(content))
        except UnicodeEncodeError:
            print(f"Skipping file {file_path} due to UnicodeEncodeError.")
def scrapeContent(url):
    """Fetch *url* and return its parsed BeautifulSoup tree.

    Returns None when the request fails for any reason (invalid URL,
    bad schema, connection error, timeout, ...).
    """
    # Removed the unused `global scraped` declaration — this function
    # never assigns to that name.
    try:
        # A timeout keeps one slow host from stalling the whole run.
        response = requests.get(url, timeout=10)
        return bs(response.content, "lxml")
    except requests.exceptions.RequestException as e:
        # RequestException is the base class of InvalidURL, InvalidSchema,
        # ConnectionError, Timeout, etc., so one handler covers all the
        # separate (and partly unreachable) clauses the original had.
        print("Skipping URL because " + str(e))
        return None
def getScrapeList(url):
    """Return a list of absolute link targets of all <a> tags on *url*.

    Relative hrefs are resolved against *url*. Returns None when the
    page itself cannot be fetched.
    """
    # NOTE(review): TLS verification is intentionally disabled to match the
    # original behavior; the warning suppression ideally belongs at module
    # level rather than per call.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    try:
        response = requests.get(url, verify=False, timeout=10)
    except requests.exceptions.RequestException as e:
        # Base class covers InvalidURL / InvalidSchema / timeouts / etc.;
        # the original's later except clauses were unreachable.
        print("Skipping URL because " + str(e))
        return None
    content = bs(response.content, "lxml")
    links = []
    for tag in content.find_all("a"):
        href = tag.get("href")
        if href is None:
            continue
        # urljoin leaves absolute URLs untouched and resolves relative
        # ones, replacing the original's dead `True` branch.
        links.append(urljoin(url, href))
    return links
def signal_handler(signum, frame):
    """Handle SIGINT (Ctrl+C): report the interruption and exit cleanly.

    Parameters renamed from the original `signal`/`frame` so the first
    argument no longer shadows the imported `signal` module.
    """
    print("\nProgram interrupted.")
    # Perform cleanup actions here, if needed
    sys.exit(0)
def main():
    """Drive the scraper: gather URLs, fetch each page, save results to disk."""
    # Set the custom signal handler for Ctrl+C
    signal.signal(signal.SIGINT, signal_handler)
    print("---Webscraper v0---")
    print("------------")
    print("PRESS Ctrl+C to stop scraping")
    print("------------")
    try:
        url = input("Enter URL: ")
        root_folder = input("Enter root folder name: ")
        maxurl = int(input("Enter scrape count: "))
        print("Scanning for URLs to scrape...")
        # Creating initial scrapelist; getScrapeList returns None on a
        # failed fetch, which the original passed straight into set().
        initial = getScrapeList(url)
        scrapelist = list(set(initial)) if initial else []
        # Expand the list by scanning each initially-found URL, capped at
        # maxurl fetches. The original tested the misspelled name
        # `maxcurl`, raising NameError on the first iteration.
        # Iterating a snapshot makes explicit that only the initial pages
        # are scanned (rebinding `scrapelist` never extended the loop).
        for u in list(scrapelist):
            if maxurl <= 0:
                break
            maxurl -= 1
            urls = getScrapeList(u)
            if urls is not None:
                scrapelist = list(set(scrapelist + urls))
        print("Total amount of URLs found: " + str(len(scrapelist)))
        print("Scraping found URLs")
        # Get the webpage content (None entries mark failed fetches)
        scraped = [scrapeContent(u) for u in scrapelist]
        print("Done scraping")
        print("Saving data in folders...")
        # Create folders with the URL index as folder name
        createFolders(scrapelist, scraped, root_folder)
        print("Succesfully scraped content")
    except KeyboardInterrupt:
        # Handle the user interrupt
        print("\nProgram interrupted.")
        # Perform cleanup actions here, if needed
        sys.exit(0)
# Guard the entry point so importing this module does not start a scrape.
if __name__ == "__main__":
    main()