|
import json |
|
import os |
|
|
|
import numpy as np |
|
|
|
|
|
class VectorCache:
    """Fixed-capacity on-disk ring buffer of float32 vectors.

    Vectors are stored in a ``numpy.memmap`` of shape ``(size, d)`` so that an
    index can be built on them later (indexes such as IVF require a large
    number of vectors for training).  Once ``size`` vectors have been written,
    new vectors wrap around and overwrite the oldest slots.

    The write position and the number of valid rows are persisted as a JSON
    list ``[offset, length]`` in a side file named ``filename + '.offset'``.

    Attributes:
        filename: Path of the memmap data file.
        offset_file: Path of the JSON side file holding ``[offset, length]``.
        d: Dimensionality of each vector.
        size: Capacity of the cache, in vectors.
        offset: Row index at which the next vector will be written.
        length: Number of rows holding valid data (never exceeds ``size``).
        db: The ``numpy.memmap`` backing store, or ``None`` after ``close()``.
        f: Open handle on the offset side file.
    """

    def __init__(self, filename: str = 'vector_cache.memmap', d: int = 768,
                 size: int = 7000000) -> None:
        """Open an existing cache at *filename*, or create a new one.

        Args:
            filename: Path of the memmap data file.
            d: Dimensionality of each vector.
            size: Maximum number of vectors the cache can hold.

        Raises:
            FileNotFoundError: The data file exists but its offset side file
                does not.
            ValueError: Raised by ``numpy.memmap`` when an existing data file
                is too small for the requested ``(size, d)`` shape.
        """
        self.filename = filename
        self.offset_file = filename + '.offset'
        self.d = d
        self.size = size
        self.offset = 0
        self.length = 0
        self.db = None

        # 'r+' keeps the existing contents; 'w+' creates (or truncates) them.
        # The SAME mode must be used for the memmap, otherwise a previously
        # written cache would be wiped on every re-open.
        mode = 'r+' if os.path.isfile(filename) else 'w+'
        self.f = open(self.offset_file, mode)
        try:
            if mode == 'r+':
                raw = self.f.read()
                # An empty side file means a previous run never reached
                # close(); treat the cache as empty rather than crashing.
                if raw.strip():
                    data = json.loads(raw)
                    self.offset = data[0]
                    self.length = data[1]
            self.db = np.memmap(filename, dtype=np.float32, mode=mode,
                                shape=(size, d), order='C')
        except Exception:
            # Do not leak the side-file handle if initialisation fails.
            self.f.close()
            raise

    def sync_offset(self) -> None:
        """Rewrite the side file with the current ``[offset, length]``."""
        self.f.seek(0)
        self.f.truncate(0)
        self.f.write(json.dumps([self.offset, self.length]))
        self.f.flush()

    def close(self) -> None:
        """Flush vectors to disk, persist the offsets and release the files.

        Safe to call more than once.
        """
        if self.db is not None:
            self.db.flush()
            # numpy.memmap has no close(); dropping the reference lets the
            # underlying mmap be released once no views of it remain.
            self.db = None

        if not self.f.closed:
            self.sync_offset()
            self.f.close()

    def add(self, vs) -> None:
        """Append the vectors in *vs*, wrapping around at the end of the buffer.

        Args:
            vs: Sliceable sequence of vectors (e.g. a 2-D array with ``d``
                columns); ``len(vs)`` is taken as the number of vectors.
        """
        total = len(vs)
        start = 0
        while start < total:
            # Write at most up to the physical end of the buffer, then wrap.
            chunk = min(total - start, self.size - self.offset)
            self.db[self.offset:self.offset + chunk, :] = vs[start:start + chunk]
            self.offset = (self.offset + chunk) % self.size
            start += chunk

        self.length = min(self.length + total, self.size)
|
|