Answer the question
In order to leave comments, you need to log in
How to remove duplicates by key in a large file?
There is a JSON file with more than 8 million lines, over 700 MB in size, in this format:
{'title':'7778', 'mes':'ruseo', 'coord': '755'}
{'title':'77789', 'mes':'ruseo', 'coord': '755'}
{'mes': 'seoru', 'title' : '7778', 'coord' : '-'}
{'mes': 'seoru', 'title' : '7778', 'coord' : '-'}
import json
# Asker's original attempt: read the whole file into memory, collect the lines
# whose title text has not been seen in an already-kept line, then write the
# kept lines out.
# NOTE(review): indentation was lost in the source page and is reconstructed
# here from the statement order -- confirm against the original post.
with open(r'C:\json3toster.json', 'r', encoding="utf-8") as fp:
    ds = fp.readlines()  # every line of the file is held in memory at once
print(len(ds))
mem = []  # raw lines kept so far
for record in ds:
    # Crude field split: drop '{' and cut on commas (no real parsing).
    name = record.replace('{','').split(',')
    for dat in name:
        dat2 = dat.split(': ')
        # Only matches a double-quoted "title" key preceded by a single space.
        if dat2[0] == ' "title"':
            newline = dat2[1]
            # Substring test of the title text against every kept line, i.e. a
            # full scan of `mem` for each record.
            if any(newline in lice for lice in mem):
                pass
            else:
                mem.append(record)
print(len(mem))
for newjs in mem:
    # The output file is reopened in append mode for each kept record.
    with open(r'd:/json_fin.json', 'a', encoding='utf-8') as fg:
        fg.write(newjs)
Answer the question
In order to leave comments, you need to log in
You are doing something terrible. Searching in a list instead of a set is especially bad. Do this:
from ast import literal_eval as eval  # ast.literal_eval() is safe; the built-in eval() is not

# Stream the input once and copy through only the first line seen for each
# title; titles already written are remembered in a set.
with open('input.txt', 'r') as fi, open('output.txt', 'w') as fo:
    seen_titles = set()
    for line in fi:
        record = eval(line)
        title = record.get('title')
        if title in seen_titles:
            continue  # a line with this title was already written
        seen_titles.add(title)
        fo.write(line)
from ast import literal_eval as eval
from functools import lru_cache


@lru_cache(maxsize=None)
def process(title):
    """Write the current record to the output file.

    The cache makes a repeat call with an already-seen *title* return the
    cached result without running the body, so each title is written once.
    Reads the module-level ``record`` and ``fo`` bound by the loop below.
    """
    fo.write(f"{record}\n")


with open('input.txt', 'r') as fi, open('output.txt', 'w') as fo:
    for line in fi:
        record = eval(line)
        process(record['title'])
>>> process.cache_info()
CacheInfo(hits=994960, misses=5040, maxsize=None, currsize=5040)
As an option, you can try this (bad question, by the way):
import re
# Sort-based de-duplication: rewrite every line as "title<TAB>mes<TAB>coord",
# sort so that equal titles become adjacent, then write one record per title.
with open(r'C:\json3toster.json', 'r', encoding="utf-8") as fp:
    ds = fp.readlines()
# Position of each (still single-quoted) key inside the tab-separated row.
d = {"'title'": 0, "'mes'": 1, "'coord'": 2}
print(len(ds))
# Every single-quoted token on a line, keys and values alike, in order.
findall = re.compile(r"'[^']*'").findall
for i, s in enumerate(ds):
    # Fresh buffer per line, so a record that lacks a field gets an empty
    # slot instead of inheriting the previous line's value.
    buf = [''] * 3
    tokens = findall(s)
    # Tokens alternate key, value; consume them from the end as pairs.
    while tokens:
        value = tokens.pop()
        buf[d[tokens.pop()]] = value
    ds[i] = '\t'.join(buf)
ds.sort()
a = ''  # title of the most recently written record
with open(r'd:/json_fin.json', 'a', encoding='utf-8') as fg:
    for s in ds:
        title, mes, coord = s.split('\t')
        if a != title:
            a = title
            # The values still carry their quotes; a comma after the title
            # value keeps the written record well-formed.
            fg.write(f"{{'title': {title}, 'mes': {mes}, 'coord': {coord}}}\n")
Didn't find what you were looking for?
Ask your question | Ask a Question
731 491 924 answers to any question