Skip to content

Commit 2bfcada

Browse files
wip
1 parent e99e991 commit 2bfcada

File tree

10 files changed

+217
-123
lines changed

10 files changed

+217
-123
lines changed

Core/CBaseModel.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def replaceByEmbeddings(self, data):
2222

2323
def _modelFilename(self, folder, postfix=''):
2424
postfix = '-' + postfix if postfix else ''
25-
return os.path.join(folder, '%s%s.h5' % (self._modelID, postfix))
25+
return os.path.join(folder, '%s%s.h5' % (self._model, postfix))
2626

2727
def save(self, folder=None, postfix=''):
2828
path = self._modelFilename(folder, postfix)

Core/CDatasetLoader.py

Lines changed: 26 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,47 @@
11
import Core.Utils as Utils
2-
import os, glob
2+
import os
33
from Core.CSamplesStorage import CSamplesStorage
4-
from Core.CDataSampler import CDataSampler
54
import numpy as np
6-
import tensorflow as tf
75
from enum import Enum
86

97
class ESampling(Enum):
108
AS_IS = 'as_is'
119
UNIFORM = 'uniform'
1210

1311
class CDatasetLoader:
14-
def __init__(self, folder, samplerArgs, sampling, stats, sampler_class):
15-
# recursively find all 'train.npz' files
16-
trainFiles = glob.glob(os.path.join(folder, '**', 'train.npz'), recursive=True)
17-
if 0 == len(trainFiles):
18-
raise Exception('No training dataset found in "%s"' % (folder, ))
19-
exit(1)
20-
21-
print('Found %d training datasets' % (len(trainFiles), ))
22-
12+
def __init__(self, folder, samplerArgs, sampling, stats, sampler_class, test_folders):
2313
self._datasets = []
24-
for trainFile in trainFiles:
25-
print('Loading %s' % (trainFile, ))
26-
# extract the placeId, userId, and screenId
27-
parts = os.path.split(trainFile)[0].split(os.path.sep)
28-
placeId, userId, screenId = parts[-3], parts[-2], parts[-1]
29-
ds = sampler_class(
30-
CSamplesStorage(
31-
placeId=stats['placeId'].index(placeId),
32-
userId=stats['userId'].index(userId),
33-
screenId=stats['screenId'].index('%s/%s' % (placeId, screenId))
34-
),
35-
**samplerArgs
36-
)
37-
ds.addBlock(Utils.datasetFrom(trainFile))
38-
self._datasets.append(ds)
39-
continue
14+
for datasetFolder, ID in Utils.dataset_from_stats(stats, folder):
15+
(place_id_index, user_id_index, screen_id_index) = ID
16+
for test_folder in test_folders:
17+
dataset = os.path.join(datasetFolder, test_folder)
18+
if not os.path.exists(dataset):
19+
continue
20+
print('Loading %s' % (dataset, ))
21+
print(f'ID: {ID}. Index: {1 + len(self._datasets)}')
22+
ds = sampler_class(
23+
CSamplesStorage(
24+
placeId=place_id_index,
25+
userId=user_id_index,
26+
screenId=screen_id_index,
27+
),
28+
**samplerArgs
29+
)
30+
ds.addBlock(Utils.datasetFrom(dataset))
31+
self._datasets.append(ds)
32+
33+
if 0 == len(self._datasets):
34+
raise Exception('No training dataset found in "%s"' % (folder, ))
4035

41-
print('Loaded %d datasets' % (len(self._datasets), ))
4236
validSamples = {
4337
i: len(ds.validSamples())
4438
for i, ds in enumerate(self._datasets)
4539
}
4640
# ignore datasets with no valid samples
4741
validSamples = {k: v for k, v in validSamples.items() if 0 < v}
42+
43+
print('Loaded %d datasets with %d valid samples' % (len(self._datasets), sum(validSamples.values())))
44+
4845
dtype = np.uint8 if len(self._datasets) < 256 else np.uint32
4946
# create an array of dataset indices to sample from
5047
sampling = ESampling(sampling)

Core/CInpaintingTrainer.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ def __init__(self, timesteps, model='simple', KP=5, **kwargs):
4040
self._eval,
4141
input_signature=[specification]
4242
)
43+
44+
if 'weights' in kwargs:
45+
self.load(**kwargs['weights'])
4346
return
4447

4548
def compile(self):
@@ -94,4 +97,10 @@ def _eval(self, xy):
9497

9598
def eval(self, data):
9699
loss = self._eval(data)
97-
return loss.numpy()
100+
return loss.numpy()
101+
102+
def save(self, folder=None, postfix=''):
103+
self._model.save(folder=folder, postfix=postfix)
104+
105+
def load(self, folder=None, postfix='', embeddings=False):
106+
self._model.load(folder=folder, postfix=postfix, embeddings=embeddings)

Core/CTestInpaintingLoader.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,14 @@ def __init__(self, testFolder):
1111
self.on_epoch_end()
1212
return
1313

14+
@lru_cache(maxsize=1)
15+
def parametersIDs(self):
16+
batch, _ = self[0]
17+
userId = batch['userId'][0, 0, 0]
18+
placeId = batch['placeId'][0, 0, 0]
19+
screenId = batch['screenId'][0, 0, 0]
20+
return placeId, userId, screenId
21+
1422
def on_epoch_end(self):
1523
return
1624

Core/CTestLoader.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,15 @@ def __init__(self, testFolder):
1010
]
1111
self.on_epoch_end()
1212
return
13-
13+
14+
@lru_cache(maxsize=1)
15+
def parametersIDs(self):
16+
batch, _ = self[0]
17+
userId = batch['userId'][0, 0, 0]
18+
placeId = batch['placeId'][0, 0, 0]
19+
screenId = batch['screenId'][0, 0, 0]
20+
return placeId, userId, screenId
21+
1422
def on_epoch_end(self):
1523
return
1624

Core/Utils.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,4 +297,25 @@ def countSamplesIn(folder):
297297
with np.load(fn) as data:
298298
res += len(data['time'])
299299
continue
300-
return res
300+
return res
301+
302+
def dataset_from_stats(stats, folder):
303+
userId = stats['userId']
304+
placeId = stats['placeId']
305+
screenId = stats['screenId']
306+
# screenId is a concatenation of placeId and screenId, to make the pair unique
307+
PlaceAndScreenId = [x.split('/') for x in screenId]
308+
309+
blackList = set(stats.get('blacklist', []))
310+
known = set([tuple(x) for x in blackList])
311+
for screen_id_index, (place_id, screen_id) in enumerate(PlaceAndScreenId):
312+
place_id_index = placeId.index(place_id)
313+
# find user_id among all
314+
for user_id_index, user_id in enumerate(userId):
315+
datasetFolder = os.path.join(folder, place_id, user_id, screen_id)
316+
if not os.path.exists(datasetFolder): continue
317+
ID = (place_id_index, user_id_index, screen_id_index)
318+
if ID in known: continue
319+
known.add(ID)
320+
321+
yield (datasetFolder, ID)

NN/networks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@ def transformLatents(x):
351351
# two eyes
352352
eyesN = eyeSize * eyeSize
353353
eyes = sMLP(sizes=[eyesN] * 2, activation='relu')(latents)
354-
eyes = L.Dense(eyesN * 2)(eyes)
354+
eyes = L.Dense(eyesN * 2, 'sigmoid')(eyes)
355355
eyes = L.Reshape((-1, eyeSize, eyeSize, 2))(eyes)
356356
# face points
357357
face = sMLP(sizes=[pointsN] * 2, activation='relu')(latents)

scripts/check-dataset.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
'''
4+
This script loads the datasets one by one and checks how many valid samples each contains
5+
'''
6+
import argparse, os, sys
7+
# add the root folder of the project to the path
8+
ROOT_FOLDER = os.path.abspath(os.path.dirname(__file__) + '/../')
9+
sys.path.append(ROOT_FOLDER)
10+
11+
from Core.CDataSamplerInpainting import CDataSamplerInpainting
12+
from Core.CDataSampler import CDataSampler
13+
import Core.Utils as Utils
14+
import json
15+
from Core.CSamplesStorage import CSamplesStorage
16+
17+
def samplesStream(params, filename, ID, batch_size, is_inpainting):
18+
placeId, userId, screenId = ID
19+
storage = CSamplesStorage(placeId=placeId, userId=userId, screenId=screenId)
20+
if is_inpainting:
21+
ds = CDataSamplerInpainting(
22+
storage,
23+
defaults=params,
24+
batch_size=batch_size, minFrames=params['timesteps'],
25+
keys=['clean']
26+
)
27+
else:
28+
ds = CDataSampler(
29+
storage,
30+
defaults=params,
31+
batch_size=batch_size, minFrames=params['timesteps'],
32+
)
33+
ds.addBlock(Utils.datasetFrom(filename))
34+
35+
N = ds.totalSamples
36+
for i in range(0, N, batch_size):
37+
indices = list(range(i, min(i + batch_size, N)))
38+
batch, rejected, accepted = ds.sampleByIds(indices)
39+
if batch is None: continue
40+
41+
# main batch
42+
x, y = batch
43+
if not is_inpainting:
44+
x = x['clean']
45+
for idx in range(len(x['points'])):
46+
yield idx
47+
return
48+
49+
def main(args):
50+
params = dict(
51+
timesteps=args.steps,
52+
stepsSampling='uniform',
53+
# no augmentations by default
54+
pointsNoise=0.0, pointsDropout=0.0,
55+
eyesDropout=0.0, eyesAdditiveNoise=0.0, brightnessFactor=1.0, lightBlobFactor=1.0,
56+
targets=dict(keypoints=3, total=10),
57+
)
58+
folder = os.path.join(args.folder, 'Data', 'remote')
59+
60+
stats = None
61+
with open(os.path.join(folder, 'stats.json'), 'r') as f:
62+
stats = json.load(f)
63+
64+
# enable all disabled datasets
65+
stats['blacklist'] = []
66+
for datasetFolder, ID in Utils.dataset_from_stats(stats, folder):
67+
trainFile = os.path.join(datasetFolder, 'train.npz')
68+
if not os.path.exists(trainFile):
69+
continue
70+
print('Processing', trainFile)
71+
72+
stream = samplesStream(params, trainFile, ID=ID, batch_size=64, is_inpainting=args.inpainting)
73+
samplesN = 0
74+
for _ in stream:
75+
samplesN += 1
76+
continue
77+
print(f'Dataset has {samplesN} valid samples')
78+
if samplesN <= args.min_samples:
79+
print(f'Warning: dataset has less or equal to {args.min_samples} samples and will be disabled')
80+
stats['blacklist'].append(ID)
81+
82+
with open(os.path.join(folder, 'stats.json'), 'w') as f:
83+
json.dump(stats, f, indent=2, sort_keys=True, default=str)
84+
85+
if __name__ == '__main__':
86+
parser = argparse.ArgumentParser()
87+
parser.add_argument('--steps', type=int, default=5)
88+
parser.add_argument('--folder', type=str, default=ROOT_FOLDER)
89+
parser.add_argument('--min-samples', type=int, default=0)
90+
parser.add_argument('--inpainting', action='store_true', default=False)
91+
main(parser.parse_args())
92+
pass

0 commit comments

Comments
 (0)