Encode JSONL data as UTF-8 when writing the gzip file, for consistent read/write encoding

Should help with issue #89
This commit is contained in:
Debanjum Singh Solanky 2023-02-12 17:33:23 -06:00
parent c156b3e087
commit 11517ba8eb

View file

@@ -51,7 +51,7 @@ def compress_jsonl_data(jsonl_data, output_path):
# Create output directory, if it doesn't exist # Create output directory, if it doesn't exist
output_path.parent.mkdir(parents=True, exist_ok=True) output_path.parent.mkdir(parents=True, exist_ok=True)
with gzip.open(output_path, 'wt') as gzip_file: with gzip.open(output_path, 'wt', encoding='utf-8') as gzip_file:
gzip_file.write(jsonl_data) gzip_file.write(jsonl_data)
logger.info(f'Wrote jsonl data to gzip compressed jsonl at {output_path}') logger.info(f'Wrote jsonl data to gzip compressed jsonl at {output_path}')