import io
import re
import sys
# Both an input path and an output path are required on the command line.
if len(sys.argv) < 3:
    print("Usage: python extractemail.py <input file> <output file>")
    sys.exit(1)

INPUT_FILE = sys.argv[1]
# Append mode: addresses are added after whatever the output file already holds.
OUTPUT_FILE = open(sys.argv[2], 'a')
CHUNK_SIZE = 1024 * 1024 * 10  # 10 * 2**20; default read size for extract_emails

# Address matcher: local part, '@', domain, then a top-level domain of two or
# more letters. The TLD class is [A-Za-z]; writing it as [A-Z|a-z] would also
# accept a literal '|', because '|' is not alternation inside a character class.
pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
# Superset of every character the address pattern can consume. Used to find
# the trailing run of text that might be the beginning of an address whose
# remainder has not been read yet.
_ADDRESS_CHARS = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    'abcdefghijklmnopqrstuvwxyz'
    '0123456789._%+-|@'
)


def extract_emails(file_path, chunk_size=CHUNK_SIZE):
    """Write every e-mail address found in *file_path* to OUTPUT_FILE.

    The file is decoded as UTF-8 (undecodable bytes are dropped) and read in
    pieces of up to ``chunk_size`` characters via ``read_in_chunks``. Each
    match of the module-level ``pattern`` is written on its own line, in the
    order it appears; duplicates are not removed.

    An address may straddle two reads. To avoid emitting a truncated address,
    the trailing run of address characters in the buffered text is held back
    and rescanned once the next piece has been appended. Everything before
    that run is scanned once and then discarded, so the buffer stays small
    unless the input is one very long run of address characters.

    Args:
        file_path: Path of the text file to scan.
        chunk_size: Number of characters requested per read.
    """
    with io.open(file_path, 'r', encoding='utf-8', errors='ignore') as bigfile:
        pending = ''
        for piece in read_in_chunks(bigfile, chunk_size):
            pending += piece

            # Walk back over the trailing address characters: anything from
            # safe_end onwards could still grow when more text arrives.
            safe_end = len(pending)
            while safe_end > 0 and pending[safe_end - 1] in _ADDRESS_CHARS:
                safe_end -= 1

            # Matches before safe_end are final; the character just before
            # safe_end cannot be part of an address, so none can cross it.
            for match in pattern.finditer(pending, 0, safe_end):
                OUTPUT_FILE.write(match.group() + '\n')

            # Keep the unfinished run plus one character of context, so the
            # pattern's leading \b sees the same neighbour it would see in
            # the unsplit text.
            pending = pending[max(safe_end - 1, 0):]

        # End of input: whatever was held back is now complete.
        for match in pattern.finditer(pending):
            OUTPUT_FILE.write(match.group() + '\n')
def read_in_chunks(file_object, chunk_size):
    """Yield successive ``file_object.read(chunk_size)`` results.

    Iteration stops at the first read that returns an empty (falsy) value;
    that empty value itself is not yielded.
    """
    block = file_object.read(chunk_size)
    while block:
        yield block
        block = file_object.read(chunk_size)
# Run the extraction, then close the module-level output handle so buffered
# addresses are flushed to disk even if the scan raises part-way through.
try:
    extract_emails(INPUT_FILE)
finally:
    OUTPUT_FILE.close()
import re
import sys
# Both an input path and an output path are required on the command line.
if len(sys.argv) < 3:
    print("Usage: python extractemail.py <input file> <output file>")
    sys.exit(1)

INPUT_FILE = sys.argv[1]
# Append mode: addresses are added after whatever the output file already holds.
OUTPUT_FILE = open(sys.argv[2], 'a')
CHUNK_SIZE = 1024 * 1024 * 10  # 10 * 2**20; default read size for extract_emails

# Address matcher: local part, '@', domain, then a top-level domain of two or
# more letters. The TLD class is [A-Za-z]; writing it as [A-Z|a-z] would also
# accept a literal '|', because '|' is not alternation inside a character class.
pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
# Superset of every character the address pattern can consume. Used to find
# the trailing run of text that might be the beginning of an address whose
# remainder has not been read yet.
_ADDRESS_CHARS = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    'abcdefghijklmnopqrstuvwxyz'
    '0123456789._%+-|@'
)


def extract_emails(file_path, chunk_size=CHUNK_SIZE):
    """Write every e-mail address found in *file_path* to OUTPUT_FILE.

    The file is decoded as UTF-8 (undecodable bytes are dropped) and read in
    pieces of up to ``chunk_size`` characters via ``read_in_chunks``. Each
    match of the module-level ``pattern`` is written on its own line, in the
    order it appears; duplicates are not removed.

    An address may straddle two reads. To avoid emitting a truncated address,
    the trailing run of address characters in the buffered text is held back
    and rescanned once the next piece has been appended. Everything before
    that run is scanned once and then discarded, so the buffer stays small
    unless the input is one very long run of address characters.

    Args:
        file_path: Path of the text file to scan.
        chunk_size: Number of characters requested per read.
    """
    with io.open(file_path, 'r', encoding='utf-8', errors='ignore') as bigfile:
        pending = ''
        for piece in read_in_chunks(bigfile, chunk_size):
            pending += piece

            # Walk back over the trailing address characters: anything from
            # safe_end onwards could still grow when more text arrives.
            safe_end = len(pending)
            while safe_end > 0 and pending[safe_end - 1] in _ADDRESS_CHARS:
                safe_end -= 1

            # Matches before safe_end are final; the character just before
            # safe_end cannot be part of an address, so none can cross it.
            for match in pattern.finditer(pending, 0, safe_end):
                OUTPUT_FILE.write(match.group() + '\n')

            # Keep the unfinished run plus one character of context, so the
            # pattern's leading \b sees the same neighbour it would see in
            # the unsplit text.
            pending = pending[max(safe_end - 1, 0):]

        # End of input: whatever was held back is now complete.
        for match in pattern.finditer(pending):
            OUTPUT_FILE.write(match.group() + '\n')
def read_in_chunks(file_object, chunk_size):
    """Yield successive ``file_object.read(chunk_size)`` results.

    Iteration stops at the first read that returns an empty (falsy) value;
    that empty value itself is not yielded.
    """
    block = file_object.read(chunk_size)
    while block:
        yield block
        block = file_object.read(chunk_size)
# Run the extraction, then close the module-level output handle so buffered
# addresses are flushed to disk even if the scan raises part-way through.
try:
    extract_emails(INPUT_FILE)
finally:
    OUTPUT_FILE.close()