• XSS.stack #1 – первый литературный журнал от юзеров форума

[PYTHON] Extract Email from your BIGASS TXT FILE (UP TO TeraByte)

redrooom

CD-диск
Пользователь
Регистрация
17.07.2023
Сообщения
14
Реакции
11
import io
import re
import sys

# Both the input path and the output path are required on the command line.
if len(sys.argv) < 3:
    print("Usage: python extractemail.py <input file> <output file>")
    sys.exit(1)

INPUT_FILE = sys.argv[1]
# Append mode: repeated runs add to the output file instead of truncating it.
# The handle is module-global because extract_emails() writes to it directly.
OUTPUT_FILE = open(sys.argv[2], 'a')
# Size passed to read() for each chunk; the input is opened in text mode, so
# this counts characters (about 10 MB for ASCII text).
CHUNK_SIZE = 1024 * 1024 * 10

# Inside a character class '|' is a literal pipe, not alternation, so the
# previous '[A-Z|a-z]' accepted strings such as "a@b.co|". The TLD class is
# now letters only.
pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

# Superset of every character the module-level `pattern` can consume. Used
# only to decide how much of the buffer tail may still belong to an address
# that continues in the next chunk.
_EMAIL_CHARS = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    'abcdefghijklmnopqrstuvwxyz'
    '0123456789._%+-@|'
)
# Upper bound on the text carried between chunks when no match is pending.
# Keeps memory flat on inputs with long runs of address-like characters.
_MAX_CARRY = 320


def _carry_start(text, limit):
    """Return the index where the trailing run of address characters begins.

    Scans backwards from the end of *text*, looking at no more than *limit*
    characters. Returns ``len(text)`` when the last character cannot be part
    of an address.
    """
    floor = max(len(text) - limit, 0)
    idx = len(text)
    while idx > floor and text[idx - 1] in _EMAIL_CHARS:
        idx -= 1
    return idx


def extract_emails(file_path, chunk_size=CHUNK_SIZE):
    """Stream *file_path* and write every address found to ``OUTPUT_FILE``.

    The file is decoded as UTF-8 (undecodable bytes are dropped) and read
    ``chunk_size`` characters at a time via ``read_in_chunks``. Each match of
    the module-level ``pattern`` is written on its own line.

    Only a short tail of each chunk is carried into the next one: the
    trailing run of address characters, or the whole of a match that touches
    that run. An address split across two reads is therefore emitted once and
    in full, and memory stays bounded even when the input contains no
    addresses at all.

    Known limitation: a run of address characters longer than ``_MAX_CARRY``
    with no match in it is cut at that limit, so an address preceded by such
    a run may lose the start of its local part.

    Args:
        file_path: Path of the text file to scan.
        chunk_size: Number of characters requested per read.
    """
    with io.open(file_path, 'r', encoding='utf-8', errors='ignore') as bigfile:
        carry = ''
        for piece in read_in_chunks(bigfile, chunk_size):
            buffer = carry + piece
            # Everything from `cut` onwards could still be extended by the
            # next piece, so it must not be emitted yet.
            cut = _carry_start(buffer, _MAX_CARRY)
            for m in pattern.finditer(buffer):
                if m.end() >= cut:
                    # This match reaches into the undecided tail; re-scan it
                    # from its start once more data is available.
                    cut = min(cut, m.start())
                    break
                OUTPUT_FILE.write(m.group() + '\n')
            # \b is evaluated against the neighbouring character, so keep one
            # character of left context in front of the carried tail.
            carry = buffer[max(cut - 1, 0):]
        # End of input: nothing can extend the tail any more, so flush it.
        for m in pattern.finditer(carry):
            OUTPUT_FILE.write(m.group() + '\n')

def read_in_chunks(file_object, chunk_size):
    """Yield successive blocks read from *file_object* until it is exhausted.

    Args:
        file_object: Any object with a ``read(size)`` method.
        chunk_size: Size passed to each ``read`` call.

    Yields:
        Each non-empty result of ``file_object.read(chunk_size)``; iteration
        stops at the first empty result.
    """
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data

# Run the extraction and always close the output handle afterwards, so that
# buffered results are flushed to disk even if the scan raises.
try:
    extract_emails(INPUT_FILE)
finally:
    OUTPUT_FILE.close()
 
Python:
import re

file_source = 'file.txt'
file_save = 'result.txt'
# Raw string so that '\.' reaches the regex engine as an escaped dot instead
# of being an invalid string escape; the compiled pattern is unchanged.
regex_email = re.compile(r'[a-z0-9_-]{3,64}@[a-z0-9_-]{3,64}\.[a-z]{2,5}', re.IGNORECASE)

# Append every address found in file_source to file_save, one per line.
with open(file_save, 'a+', encoding='utf-8') as sf:
    with open(file_source, 'r', encoding='utf-8') as lines:
        for line in lines:
            # findall scans the whole line. The previous re.match guard only
            # accepted lines that *started* with an address, and the whole
            # line was written once per address instead of the address.
            for email in regex_email.findall(line):
                print('find email: ', email)
                sf.write(email + '\n')
 
Python:
import re

file_source = 'file.txt'
file_save = 'result.txt'
# Raw string so that '\.' reaches the regex engine as an escaped dot instead
# of being an invalid string escape; the compiled pattern is unchanged.
regex_email = re.compile(r'[a-z0-9_-]{3,64}@[a-z0-9_-]{3,64}\.[a-z]{2,5}', re.IGNORECASE)

# Append every address found in file_source to file_save, one per line.
with open(file_save, 'a+', encoding='utf-8') as sf:
    with open(file_source, 'r', encoding='utf-8') as lines:
        for line in lines:
            # findall scans the whole line. The previous re.match guard only
            # accepted lines that *started* with an address, and the whole
            # line was written once per address instead of the address.
            for email in regex_email.findall(line):
                print('find email: ', email)
                sf.write(email + '\n')
Код:
# Signature one-liner: importing the standard-library `this` module prints the
# Zen of Python; `this.s` holds that text ROT13-encoded, and the slice takes
# its first 99 characters (the value is discarded when run as a script).
import this; this.s[:99]
 
Пожалуйста, обратите внимание, что пользователь заблокирован
Sure, this works, but it would be horrible to use Python for it. Use C instead.

I ran through 500 GB of data using regex with Python; it took about 7 hours on an HDD. After I rewrote it in C, it took 1 minute; in C#, 8 minutes.
 


Напишите ответ...
  • Вставить:
Прикрепить файлы
Верх