Commit a37c48f9 authored by mibaj

Update transform_to_window_dataset.py

parent d9be7324
"""
A script to apply the sliding window approach on input time series data. Creates fully prepared datasets for analysis.
@author: Milena Bajic (DTU Compute)
"""
import sys, os, glob, time
import argparse
import pickle
import shutil
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import Pool, cpu_count
# TODO: save defect width, height, severity and speed too
class Window_dataset():

    def __init__(self, input_dir, filestring, win_size = 2, out_dir = '', is_test = True):
        # Initial processing time
        t0 = time.time()

        # Get from input
@@ -55,28 +62,10 @@ class Window_dataset():
        print('Number of split dataframes: {0}'.format(self.n_splits))
        print('Number of processes: {0}'.format(self.n_processes))
        # Process each split dataframe sequentially
        for df_i, df in enumerate(self.split_input_dataframes):
            print('===> Passing df: ', df_i)
            df.reset_index(inplace=True, drop=True)
            self.make_sliding_window_df(df_i, df)

        # Disabled parallel mode: prepare chunks of the split dataframes and
        # map them over a process pool instead of the sequential loop above.
        # (In this mode make_sliding_window_df would need to unpack a single
        # tuple argument instead of taking two parameters.)
        '''
        chunks = [self.split_input_dataframes[x: x + self.n_processes]
                  for x in range(0, len(self.split_input_dataframes), self.n_processes)]
        print('Number of chunks: {0}'.format(len(chunks)))
        pool = Pool(processes=self.n_processes)
        for chunk_number, chunk in enumerate(chunks):
            # [(chunk_number, df_number, chunk_data)]
            pass_chunk = [(chunk_number,) + c for c in enumerate(chunk)]
            print('Chunk data prepared, starting pool map..')
            pool.map(self.make_sliding_window_df, pass_chunk)
        pool.close()
        pool.join()
        '''

        dt = round(time.time() - t0, 1)
        print('Time to process: {0} s'.format(dt))
@@ -86,31 +75,23 @@ class Window_dataset():
        print('Loading: {0}'.format(filename))
        with open(filename, "rb") as f:
            df = pickle.load(f)
        return df
    def remove_samples_with_zero_counts(self, input_dataframe):
        # Remove samples whose defect is too narrow to have any point "caught"
        # in the type and severity columns
        input_dataframe['keep'] = input_dataframe.type.apply(lambda arr: np.count_nonzero(arr) > 0)
        input_dataframe = input_dataframe[input_dataframe['keep']]
        input_dataframe.drop(['keep'], axis=1, inplace=True)
        input_dataframe.reset_index(drop=True, inplace=True)
        return input_dataframe
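
    # Example of the filter above (hypothetical type values):
    #   type == [0, 0, 0]  ->  np.count_nonzero == 0  ->  row dropped
    #   type == [0, 2, 2]  ->  np.count_nonzero == 2  ->  row kept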
    def make_sliding_window_df(self, df_i, input_dataframe_part):
        # Make the sliding windows: each window is constant in distance,
        # variable in length (number of samples) and slides by one point
        print('Making sliding window')
        window_df = pd.DataFrame([], columns=self.window_columns)

        # Fill the dataframe with windows taken from the initial one
        for index, row in input_dataframe_part.iterrows():
            if index % 500 == 0:
                print('Processing row: {0}/{1}'.format(index, input_dataframe_part.shape[0]))
@@ -129,8 +110,8 @@ class Window_dataset():
            # End index in the whole row, so that the last window still spans win_size meters
            end_index = np.where(row.distance > 100 - self.win_size)[0][0] - 1

            # Loop over the windows, including the last one
            for i in range(0, end_index + 1):
                try:
                    # Get min and max index of this window
                    window_start_meters = row.distance[i]
@@ -148,10 +129,8 @@ class Window_dataset():
                    # Put this window into the row dataframe
                    for col in self.window_columns:
                        if col == 'window_class':  # compute window class column
                            # Only single-label windows, or windows with 0 (no defect)
                            # plus one defect label, are possible
                            unique_classes = np.unique(row['type'][i: window_end_index])
                            if len(unique_classes) == 1:
                                row_df.at[i, col] = unique_classes[0]
                            elif len(unique_classes) == 2:
@@ -162,14 +141,12 @@ class Window_dataset():
                            row_df.at[i, col] = row[col][i: window_end_index]
                        else:
                            row_df.at[i, col] = row[col]  # float or string, just repeat
                except:
                    # Window runs past the end of the row: skip it
                    pass

        return row_df
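
    # window_class illustration (hypothetical values): a window whose type
    # slice is [0, 0, 3, 3] yields np.unique == [0, 3]; per the comment above
    # only one defect label can occur per window, so the two-class branch
    # (elided here) presumably assigns the nonzero label 3 as the window class.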
    def save_pickle(self, df, out_dir, df_type):
        print('Saving {0} as pickle.'.format(df_type))
        pickle_name = out_dir + '/' + df_type + '_windows.pkl'
        df.to_pickle(pickle_name)
@@ -177,39 +154,38 @@ class Window_dataset():
        return
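
# A minimal, self-contained sketch (not part of the original script) of the
# windowing idea in make_sliding_window_df: windows with a fixed length in
# meters slide along the distance axis one sample at a time. The values and
# the helper name are hypothetical, and the exact index formulas of the class
# live partly in elided sections above.
def _sliding_window_demo():
    distance = np.linspace(0, 100, 11)   # toy distance axis: 0, 10, ..., 100 m
    win_size = 20                        # hypothetical window length in meters
    # Index of the last window start for which the window still fits below 100 m
    end_index = np.where(distance > 100 - win_size)[0][0] - 1
    for i in range(0, end_index + 1):
        window_start_meters = distance[i]
        # One past the last sample that is still inside the window
        window_end_index = np.where(distance < window_start_meters + win_size)[0][-1] + 1
        print(i, distance[i: window_end_index])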
# ============================= #
# ========== SETUP ============ #
# ============================= #
if __name__ == "__main__":

    home = os.path.expanduser('~')

    parser = argparse.ArgumentParser(description='Please provide command line arguments.')
    parser.add_argument('--test', action='store_true',
                        help='If set, process only 100 rows (use for testing purposes).')
    parser.add_argument('--filetype', default='train',
                        help='Choose between train, test or valid. This file will be processed and the output created.')
    parser.add_argument('--input_dir', default='{0}/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020/train-val-test-normalized'.format(home),
                        help='Input directory containing single-defect .mat files.')
    parser.add_argument('--output_dir_base', default='{0}/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020'.format(home),
                        help='Directory base where a new directory with the output files will be created.')
    # ============================= #
    args = parser.parse_args()

    # Input
    # =====#
    input_dir = args.input_dir
    output_dir_base = args.output_dir_base
    is_test = args.test
    filetype = args.filetype
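
    # Example invocation (paths hypothetical, a usage sketch only):
    #   python transform_to_window_dataset.py --filetype valid \
    #       --input_dir ~/sim-data/train-val-test-normalized \
    #       --output_dir_base ~/sim-data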
# Output directory
# =======#
out_dir = '{0}-split-into-windows'.format(input_dir)
# Make output directory
out_dir = '{0}/train-val-test-normalized-split-into-windows'.format(output_dir_base)
if not os.path.exists(out_dir):
os.makedirs(out_dir)
if not os.path.exists(out_dir + '/'+str(filetype)):
os.makedirs(out_dir + '/'+str(filetype))
    # Process
    # ======#
    if is_test:
        result = Window_dataset(input_dir, 'test', out_dir=out_dir, is_test=is_test)
    else:
        # Pass is_test explicitly: the constructor default is True, which would
        # otherwise truncate a full run to 100 rows
        result = Window_dataset(input_dir, filetype, out_dir=out_dir + '/' + str(filetype), is_test=is_test)
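
    # Reading the produced windows back (a sketch only; it assumes save_pickle
    # is called with df_type == filetype, which happens in an elided section):
    #   windows = pd.read_pickle('{0}/{1}/{1}_windows.pkl'.format(out_dir, filetype))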