From a37c48f9699f30201a3e28d24a01b0d634a98592 Mon Sep 17 00:00:00 2001
From: Milena Bajic <mibaj@dtu.dk>
Date: Mon, 31 Aug 2020 17:51:57 +0200
Subject: [PATCH] Update transform_to_window_dataset.py

---
 util_scripts/transform_to_window_dataset.py | 120 ++++++++-----------
 1 file changed, 49 insertions(+), 71 deletions(-)

diff --git a/util_scripts/transform_to_window_dataset.py b/util_scripts/transform_to_window_dataset.py
index 049eaea..a810712 100755
--- a/util_scripts/transform_to_window_dataset.py
+++ b/util_scripts/transform_to_window_dataset.py
@@ -1,18 +1,25 @@
+"""
+A script that applies the sliding-window approach to input time series data and creates fully prepared datasets for analysis.
+
+@author: Milena Bajic (DTU Compute)
+"""
+
+
 import sys,os, glob, time
+import argparse
 import pickle
 import shutil
 import random
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
-from multiprocessing import Pool, cpu_count
 
-# save def. width, height, severity and speed too
 
 class Window_dataset():
 
     def __init__(self, input_dir, filestring, win_size = 2, out_dir = '', is_test = True):
 
+        # Start timing the processing
         t0=time.time()
 
         # Get from input
@@ -55,28 +62,10 @@ class Window_dataset():
         print('Number of split dataframes: {0}'.format(self.n_splits))
         print('Number of processes: {0}'.format(self.n_processes))
 
-        # Prepare chunks
-        #print('Preparing chunks')
-        #self.chunks = [ self.split_input_dataframes[x:x+self.n_processes]
-        #for x in range(0, len(self.split_input_dataframes), self.n_processes)]
-        #print('Number of chunks: {0}'.format( len(self.chunks)))
-
-
         for df_i, df in list(enumerate(self.split_input_dataframes)):
             print('===> Passing df: ',df_i)
             df.reset_index(inplace=True, drop=True)
             self.make_sliding_window_df(df_i, df)
-        '''
-        pool = Pool(processes=self.n_processes)
-        pass_chunk = list(enumerate(chunk))
-        pass_chunk = [(chunk_number,) + c for c in pass_chunk] # [ (chunk_number, df_number, chunk_data) ]
-        print('Chunk data prepared, starting pool map..')
-        pool.map(self.make_sliding_window_df, pass_chunk)
-        pool.close()
-        pool.join()
-        '''
-        #sys.exit(0)
 
         dt = round(time.time()-t0,1)
         print('Time to process: {0} s'.format(dt))
@@ -86,31 +75,23 @@
         print('Loading: {0}'.format(filename))
         with open(filename, "rb") as f:
             df = pickle.load(f)
-
         return df
 
     def remove_samples_with_zero_counts(self, input_dataframe):
-        # remove samples with too narrow defects so there is no point 'caught' in type and severity
+        # Remove samples whose defects are too narrow for any point to be "caught" in type and severity
        input_dataframe['keep'] = input_dataframe.type.apply(lambda row: np.count_nonzero(row)>0)
         input_dataframe = input_dataframe[ input_dataframe['keep']==True ]
         input_dataframe.drop(['keep'],axis=1, inplace = True)
         input_dataframe.reset_index(drop=True, inplace=True)
-
         return input_dataframe
 
     def make_sliding_window_df(self, df_i, input_dataframe_part):
+        # Build sliding windows (each window: constant length in metres, variable number of points, slides by 1 point)
         print('Making sliding window')
-        #try: # parallel mode, pass data in chunks
-        #chunk_i, df_i, input_dataframe_part = inp
-        #print('Chunk_i: ', chunk_i)
-        #print('df_i: ',df_i)
         window_df = pd.DataFrame([], columns = self.window_columns)
-        #print('Data: ',type(input_dataframe_part))
-
-        # Fill pd with windows from initial one
+        # Fill the dataframe with windows from the initial one
         for index, row in input_dataframe_part.iterrows():
             if (index%500==0):
                 print('Processing row: {0}/{1}'.format(index,input_dataframe_part.shape[0]))
@@ -129,8 +110,8 @@ class Window_dataset():
         end_index = np.where(row.distance > 100 - self.win_size )[0][0]-1 #
         #print(end_index, row.distance[end_index]) # end index in the whole row (so the last sample is 2m)
 
-        # Loop over windows
-        for i in range(0, end_index+1): # to include the last window
+        # Loop over the windows (end_index+1 so that the last window is included)
+        for i in range(0, end_index+1):
             try:
                 # Get min and max index of this window
                 window_start_meters= row.distance[i]
@@ -148,10 +129,8 @@ class Window_dataset():
 
                 # Put this window into row df data
                 for col in self.window_columns:
-                    #print('Col: ',col)
                     if col=='window_class': # compute window class column
                         unique_classes = np.unique(row['type'][i: window_end_index]) #possible are only 1-label windows or windows with 0 (no defect) and 1 defect
-                        #print('uniq: ',unique_classes)
                         if len(unique_classes)==1:
                             row_df.at[i,col] = unique_classes[0]
                         elif len(unique_classes)==2:
@@ -162,14 +141,12 @@ class Window_dataset():
                         row_df.at[i,col] = row[col][i: window_end_index]
                     else:
                         row_df.at[i,col] = row[col] #float or string, just repeat
-                        #sys.exit(0)
+
             except:
                 pass
-        #print(row_df.window_class)
         return row_df
 
     def save_pickle(self, df, out_dir, df_type):
-
         print('Saving {0} as pickle.'.format(df_type))
         pickle_name = out_dir+'/'+df_type+'_windows.pkl'
         df.to_pickle(pickle_name)
@@ -177,39 +154,40 @@
         return
 
-
-
-# =========SETUP ============== #
-# ============================= #
-
-cluster = True
-is_test = False
-filetype = 'train'
-
 #===============================#
 # ============================= #
-# Input dir
-# =======#
-if cluster:
-    input_dir = '/dtu-compute/mibaj/Golden-car-simulation-August-2020/train-val-test-normalized'
-else:
-    input_dir = '/Users/mibaj/quarter-car-simulation-data-analysis/data/Golden-car-simulation-August-2020/train-val-test-normalized'
-
-# Output directory
-# =======#
-out_dir = '{0}-split-into-windows'.format(input_dir)
-if not os.path.exists(out_dir):
-    os.makedirs(out_dir)
-if not os.path.exists(out_dir + '/'+str(filetype)):
-    os.makedirs(out_dir + '/'+str(filetype))
-
-# Test
-# ======#
-if is_test:
-    test = Window_dataset(input_dir, 'test', out_dir = out_dir, is_test = is_test)
-    sys.exit(0)
-
-# Datasets
-# ====================== #
-result = Window_dataset(input_dir, filetype, out_dir = out_dir + '/'+str(filetype), is_test = is_test)
+
+if __name__ == "__main__":
+
+    home = os.path.expanduser('~')
+    parser = argparse.ArgumentParser(description='Please provide command line arguments.')
+    parser.add_argument('--test', action = 'store_true',
+                        help = 'If set, only 100 rows will be processed (use for testing purposes).')
+    parser.add_argument('--filetype', default = 'train',
+                        help = 'Choose between train, test or valid. The chosen split will be processed and its output created.')
+    parser.add_argument('--input_dir', default = '{0}/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020/train-val-test-normalized'.format(home),
+                        help = 'Input directory containing the train/valid/test pickle files.')
+    parser.add_argument('--output_dir_base', default = '{0}/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020'.format(home),
+                        help = 'Directory base where a new directory with output files will be created.')
+
+    args = parser.parse_args()
+
+    input_dir = args.input_dir
+    output_dir_base = args.output_dir_base
+    is_test = args.test
+    filetype = args.filetype
+
+    # Make output directories (the filetype subdirectory is used for a full run)
+    out_dir = '{0}/train-val-test-normalized-split-into-windows'.format(output_dir_base)
+    if not os.path.exists(out_dir):
+        os.makedirs(out_dir)
+    if not is_test and not os.path.exists(out_dir + '/' + str(filetype)):
+        os.makedirs(out_dir + '/' + str(filetype))
+
+    # Process
+    # ======#
+    if is_test:
+        result = Window_dataset(input_dir, 'test', out_dir = out_dir, is_test = is_test)
+    else:
+        result = Window_dataset(input_dir, filetype, out_dir = out_dir + '/' + str(filetype), is_test = is_test)
-- 
GitLab
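
Reviewer note, for context rather than for merging: below is a minimal, self-contained sketch of the sliding-window extraction this patch applies, followed by an example invocation of the updated CLI. The window semantics (fixed length in metres, variable number of points, slide forward by one point, window_class derived from the defect labels) come from the patch itself; the column name 'acceleration', the helper name extract_windows, and the synthetic input row are hypothetical, for illustration only.

    # sliding_window_sketch.py -- illustrative sketch, not the patched module.
    import numpy as np
    import pandas as pd

    def extract_windows(row, win_size=2.0, profile_length=100.0):
        # Cut one simulated profile into windows of win_size metres, sliding
        # forward by one sample point, mirroring make_sliding_window_df.
        # Last start index from which a full window still fits on the profile.
        end_index = np.where(row['distance'] > profile_length - win_size)[0][0] - 1
        windows = []
        for i in range(0, end_index + 1):
            start_m = row['distance'][i]
            # First sample at or past the window end -> exclusive slice end.
            stop = np.where(row['distance'] >= start_m + win_size)[0][0]
            windows.append({
                'acceleration': row['acceleration'][i:stop],  # hypothetical signal column
                # The patch keeps the single label, or the defect label when a window
                # mixes 0 (no defect) with one defect type; max() covers both here.
                'window_class': int(np.max(row['type'][i:stop])),
            })
        return pd.DataFrame(windows)

    if __name__ == '__main__':
        # Tiny synthetic 10 m profile, ~0.1 m spacing, one type-1 defect at 4-5 m.
        distance = np.linspace(0.0, 10.0, 101)
        row = {
            'distance': distance,
            'acceleration': np.random.default_rng(0).normal(size=101),
            'type': np.where((distance >= 4.0) & (distance <= 5.0), 1, 0),
        }
        print(extract_windows(row, win_size=2.0, profile_length=10.0).head())

Example run of the patched script (arguments as defined above; with the store_true fix, --test now behaves as a flag):

    python util_scripts/transform_to_window_dataset.py --filetype valid --test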