From a37c48f9699f30201a3e28d24a01b0d634a98592 Mon Sep 17 00:00:00 2001
From: Milena Bajic <mibaj@dtu.dk>
Date: Mon, 31 Aug 2020 17:51:57 +0200
Subject: [PATCH] Update transform_to_window_dataset.py

---
 util_scripts/transform_to_window_dataset.py | 118 ++++++++------------
 1 file changed, 47 insertions(+), 71 deletions(-)

diff --git a/util_scripts/transform_to_window_dataset.py b/util_scripts/transform_to_window_dataset.py
index 049eaea..a810712 100755
--- a/util_scripts/transform_to_window_dataset.py
+++ b/util_scripts/transform_to_window_dataset.py
@@ -1,18 +1,25 @@
+"""
+A script that applies a sliding-window transform to input time series data and creates fully prepared datasets for analysis.
+
+@author: Milena Bajic (DTU Compute)
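+
+Example usage (see the argparse arguments below for all options):
+    python transform_to_window_dataset.py --filetype train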
+"""
+
+
 import sys,os, glob, time
+import argparse
 import pickle
 import shutil
 import random
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
-from multiprocessing import Pool, cpu_count
 
-# save def. width, height, severity and speed too
 
 class Window_dataset():
 
     def __init__(self, input_dir, filestring, win_size = 2, out_dir = '', is_test = True):
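+        # Note: win_size is the window length in meters (the simulated track is
+        # 100 m; see make_sliding_window_df). With is_test = True only a reduced
+        # number of rows is processed, for quick checks.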
 
+        # Start timer to measure the total processing time
         t0=time.time()
 
         # Get from input
@@ -55,28 +62,10 @@ class Window_dataset():
         print('Number of split dataframes: {0}'.format(self.n_splits))
         print('Number of processes: {0}'.format(self.n_processes))
 
-        # Prepare chunks
-        #print('Preparing chunks')
-        #self.chunks = [ self.split_input_dataframes[x:x+self.n_processes]
-                       #for x in range(0, len(self.split_input_dataframes), self.n_processes)]
-        #print('Number of chunks: {0}'.format( len(self.chunks)))
-
-
-
         for df_i, df in list(enumerate(self.split_input_dataframes)):
             print('===> Passing df: ',df_i)
             df.reset_index(inplace=True, drop=True)
             self.make_sliding_window_df(df_i, df)
-            '''
-            pool = Pool(processes=self.n_processes)
-            pass_chunk = list(enumerate(chunk))
-            pass_chunk = [(chunk_number,) + c for c in pass_chunk] # [ (chunk_number, df_number, chunk_data) ]
-            print('Chunk data prepared, starting pool map..')
-            pool.map(self.make_sliding_window_df, pass_chunk)
-            pool.close()
-            pool.join()
-            '''
-            #sys.exit(0)
 
         dt = round(time.time()-t0,1)
         print('Time to process: {0} s'.format(dt))
@@ -86,31 +75,23 @@ class Window_dataset():
         print('Loading: {0}'.format(filename))
         with open(filename, "rb") as f:
             df = pickle.load(f)
-
         return df
 
     def remove_samples_with_zero_counts(self, input_dataframe):
-        # remove samples with too narrow defects so there is no point 'caught' in type and severity
+        # Remove samples whose defects are too narrow to be captured by any point in the type and severity arrays
         input_dataframe['keep'] = input_dataframe.type.apply(lambda row: np.count_nonzero(row)>0)
         input_dataframe = input_dataframe[ input_dataframe['keep']==True ]
         input_dataframe.drop(['keep'],axis=1, inplace = True)
         input_dataframe.reset_index(drop=True, inplace=True)
-
         return input_dataframe
 
 
     def make_sliding_window_df(self, df_i, input_dataframe_part):
+        # Build sliding windows (each window spans a constant distance, has a variable number of samples, and slides forward by one point)
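+        # The signal is not uniformly sampled in distance, so a fixed win_size
+        # in meters yields a variable number of samples per window; each window
+        # starts one sample after the previous one.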
         print('Making sliding window')
-        #try: # parallel mode, pass data in chunks
-            #chunk_i, df_i, input_dataframe_part = inp
-            #print('Chunk_i: ', chunk_i)
-            #print('df_i: ',df_i)
-
         window_df = pd.DataFrame([], columns = self.window_columns)
 
-        #print('Data: ',type(input_dataframe_part))
-
-        # Fill pd with windows from initial one
+        # Fill the DataFrame with windows extracted from the initial one
         for index, row in input_dataframe_part.iterrows():
             if (index%500==0):
                 print('Processing row: {0}/{1}'.format(index,input_dataframe_part.shape[0]))
@@ -129,8 +110,8 @@ class Window_dataset():
         end_index = np.where(row.distance > 100 - self.win_size )[0][0]-1 #
         #print(end_index, row.distance[end_index])   # end index in the whole row (so the last sample is 2m)
 
-        # Loop over windows
-        for i in range(0, end_index+1): # to include the last window
+        # Loop over the windows
+        for i in range(0, end_index+1): # end_index+1 so the last window is included
             try:
                 # Get min and max index of this window
                 window_start_meters= row.distance[i]
@@ -148,10 +129,8 @@ class Window_dataset():
 
                 # Put this window into row df data
                 for col in self.window_columns:
-                    #print('Col: ',col)
                     if col=='window_class': # compute window class column
                         unique_classes = np.unique(row['type'][i: window_end_index]) #possible are only 1-label windows or windows with 0 (no defect) and 1 defect
-                        #print('uniq: ',unique_classes)
                         if len(unique_classes)==1:
                            row_df.at[i,col] = unique_classes[0]
                         elif len(unique_classes)==2:
@@ -162,14 +141,12 @@ class Window_dataset():
                         row_df.at[i,col] = row[col][i: window_end_index]
                     else:
                         row_df.at[i,col] = row[col] #float or string, just repeat
-                #sys.exit(0)
+
             except:
                 pass
-        #print(row_df.window_class)
         return row_df
 
     def save_pickle(self, df, out_dir, df_type):
-
         print('Saving {0} as pickle.'.format(df_type))
         pickle_name = out_dir+'/'+df_type+'_windows.pkl'
         df.to_pickle(pickle_name)
@@ -177,39 +154,38 @@ class Window_dataset():
         return
 
 
-
-
-# =========SETUP ============== #
-# ============================= #
-
-cluster = True
-is_test = False
-filetype = 'train'
-
 #===============================#
 # ============================= #
 
-# Input dir
-# =======#
-if cluster:
-    input_dir = '/dtu-compute/mibaj/Golden-car-simulation-August-2020/train-val-test-normalized'
-else:
-    input_dir = '/Users/mibaj/quarter-car-simulation-data-analysis/data/Golden-car-simulation-August-2020/train-val-test-normalized'
-
-# Output directory
-# =======#
-out_dir = '{0}-split-into-windows'.format(input_dir)
-if not os.path.exists(out_dir):
-    os.makedirs(out_dir)
-if not os.path.exists(out_dir + '/'+str(filetype)):
-    os.makedirs(out_dir + '/'+str(filetype))
-
-# Test
-# ======#
-if is_test:
-    test = Window_dataset(input_dir, 'test', out_dir = out_dir, is_test = is_test)
-    sys.exit(0)
-
-# Datasets
-# ====================== #
-result = Window_dataset(input_dir, filetype, out_dir = out_dir + '/'+str(filetype), is_test = is_test)
+
+if __name__ == "__main__":
+
+    home = os.path.expanduser('~')
+    parser = argparse.ArgumentParser(description='Apply a sliding-window transform to the input time series data.')
+    parser.add_argument('--test', action = 'store_true',
+                        help = 'If set, process only 100 rows (use for testing purposes).')
+    parser.add_argument('--filetype', default = 'train',
+                        help = 'Choose between train, test or valid. The chosen file will be processed and the output created.')
+    parser.add_argument('--input_dir', default = '{0}/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020/train-val-test-normalized'.format(home),
+                        help = 'Input directory containing the input pickle files (train/valid/test).')
+    parser.add_argument('--output_dir_base', default = '{0}/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020'.format(home),
+                        help='Base directory in which a new directory with the output files will be created.')
+
+    args = parser.parse_args()
+
+    input_dir = args.input_dir
+    output_dir_base = args.output_dir_base
+    is_test = args.test
+    filetype = args.filetype
+
+    # Make output directory
+    out_dir = '{0}/train-val-test-normalized-split-into-windows'.format(output_dir_base)
+    if not os.path.exists(out_dir):
+        os.makedirs(out_dir)
+
+    # Process
+    # ======#
+    if is_test:
+        result = Window_dataset(input_dir, 'test', out_dir = out_dir, is_test = is_test)
+    else:
+        # Create the per-filetype subdirectory; pass is_test explicitly since it defaults to True
+        filetype_out_dir = out_dir + '/' + str(filetype)
+        if not os.path.exists(filetype_out_dir):
+            os.makedirs(filetype_out_dir)
+        result = Window_dataset(input_dir, filetype, out_dir = filetype_out_dir, is_test = is_test)
-- 
GitLab