Commit 4b45627b authored by milenabaj

data preparation module

README 0 → 100755
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 31 14:09:35 2020
@author: milena
"""
Simulation of the acceleration signal produced when a car traverses a road with cracks, potholes and patches.
Car parameters: Golden car model
Road length: 100 m
Defect geometries:
patch:
dmin = 0.005; dmax = 0.05; dincr = 0.005; % minimum, maximum and increment of severity
wmin = 0.250; wmax = 5.00; wincr = 0.0500; % minimum, maximum and increment of length
pothole:
dmin = -0.010; dmax = -0.25; dincr = -0.005; % minimum, maximum and increment of severity
wmin = 0.050; wmax = 0.50; wincr = 0.0250; % minimum, maximum and increment of length
crack:
dmin = -0.020; dmax = -0.30; dincr = -0.005; % minimum, maximum and increment of severity
wmin = 0.004; wmax = 0.04; wincr = 0.0020; % minimum, maximum and increment of length
Speed: 0-120 km/h in steps of 2 km/h
Sampling frequency: around 300 Hz (can differ by a few Hz between examples)
Cases where the defect is too narrow to be recorded at this sampling frequency are removed during preprocessing.
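As a rough illustration of the grid size (assuming inclusive endpoints): the patch specification alone gives (0.05-0.005)/0.005 + 1 = 10 severities and (5.00-0.25)/0.05 + 1 = 96 lengths, i.e. about 960 geometry combinations per simulated speed.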
Directory contents:
- Matlab-files: output of the simulation module - each .mat file corresponds to one defect type, geometry and speed.
- train-val-test: all Matlab files are converted to numpy arrays, floats and strings and organized into a pandas DataFrame,
which is saved as a pickle file ('simulation_full.pkl'). Each row in the DataFrame corresponds to one Matlab file. The full file is
split into 60%/20%/20% train/valid/test files. Defects too narrow to contain any sampled point are removed.
- train-val-test-normalized: the train file from train-val-test is scaled to the 0-1 range. The scaler is saved and applied to
the valid and test files.
- train-val-test-normalized-split-into-windows: a sliding window of size = 2 m and step = 1 point is applied to the train/valid/test files from train-val-test-normalized.
The results are saved in this directory; these are the final inputs for analysis.
- scaler.pkl: train scaler file.
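A minimal loading sketch (the directory layout above is assumed; 'train_0_windows.pkl' is just an example chunk name produced by the windowing step):

    import pandas as pd
    base = 'Golden-car-simulation-August-2020'
    scaler = pd.read_pickle(base + '/train-val-test-normalized/scaler.pkl')   # rows 0/1/2 = min/max/(max-min) per scaled column
    windows = pd.read_pickle(base + '/train-val-test-normalized-split-into-windows/train/train_0_windows.pkl')
    print(windows['window_class'].value_counts())                             # defect class per 2 m window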
"""
A scaling script to be run on prepared train/val/test data.
@author: Milena Bajic (DTU Compute)
"""
import sys,os, glob
import argparse
import pickle
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def save_split_df(df, df_type, out_dir):
    print('Saving {0} as pickle.'.format(df_type))
    pickle_name = out_dir + '/' + df_type + '.pkl'
    df.to_pickle(pickle_name)
    print('Wrote output file to: ', pickle_name)
    return df
def load_pickle(input_dir, string):
    filename = '{0}/{1}.pkl'.format(input_dir, string)
    print('Loading: {0}'.format(filename))
    with open(filename, "rb") as f:
        df = pickle.load(f)
    return df
def scale_train_df(train_df, cols_to_scale = [' Hastighed [m/s]', ' Acceleration [m/s²]']):
    # Default column names are Danish: 'Hastighed' = speed, 'Acceleration' = acceleration.
    print('Scaling train data')
    # Get min and max over the full dataset
    train_df_scaled = pd.DataFrame([], columns = train_df.columns)
    train_df_maxmin = pd.DataFrame([], columns = train_df.columns)
    for col in train_df.columns:
        if col in cols_to_scale:
            # Get parameters (cells are numpy arrays, so take min/max per row first)
            col_min = min(train_df[col].apply(lambda row: row.min()))
            col_max = max(train_df[col].apply(lambda row: row.max()))
            col_diff = col_max - col_min
            # Scale
            train_df_scaled[col] = train_df[col].apply(lambda row: (row - col_min)/col_diff)
            # Save scaler info
            train_df_maxmin.at[0, col] = col_min
            train_df_maxmin.at[1, col] = col_max
            train_df_maxmin.at[2, col] = col_diff  # difference
        else:
            train_df_scaled[col] = train_df[col]
    return train_df_scaled, train_df_maxmin
def scale_non_train_df(df, train_df_maxmin):
    print('Scaling valid/test data')
    df_scaled = pd.DataFrame([], columns = df.columns)
    for col in df.columns:
        # Get params saved from the train split
        col_min = train_df_maxmin.at[0, col]
        col_max = train_df_maxmin.at[1, col]
        col_diff = train_df_maxmin.at[2, col]  # difference
        # Scale only columns that have scaler entries (unscaled columns were left as NaN)
        if pd.isnull(col_min):
            df_scaled[col] = df[col]
        else:
            df_scaled[col] = df[col].apply(lambda row: (row - col_min)/col_diff)
    return df_scaled
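# Illustration (not executed): the "scaler" returned by scale_train_df is itself a DataFrame
# whose row 0 holds the per-column minimum, row 1 the maximum and row 2 the difference, e.g.
#     scaler.at[0, 'acceleration']  -> min acceleration seen in the train split
#     scaler.at[2, 'acceleration']  -> max - min, the divisor used for valid/test scaling
# Columns that were not scaled stay NaN in this DataFrame, which is how scale_non_train_df
# knows to leave them untouched.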
# ============================= #
# ============================= #
if __name__ == "__main__":
    home = os.path.expanduser('~')
    parser = argparse.ArgumentParser(description='Please provide command line arguments.')
    parser.add_argument('--input_dir', default = '{0}/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020/train-val-test'.format(home),
                        help = 'Input directory containing the train/valid/test pickle files.')
    parser.add_argument('--output_dir_base', default = '{0}/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020'.format(home),
                        help = 'Directory base where a new directory with output files will be created.')
    args = parser.parse_args()
    input_dir = args.input_dir
    output_dir_base = args.output_dir_base

    # Make output directory
    out_dir = '{0}/train-val-test-normalized'.format(output_dir_base)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Load files
    train = load_pickle(input_dir, 'train')
    valid = load_pickle(input_dir, 'valid')
    test = load_pickle(input_dir, 'test')

    # Scale
    train_scaled, scaler = scale_train_df(train, cols_to_scale = ['acceleration'])
    valid_scaled = scale_non_train_df(valid, scaler)
    test_scaled = scale_non_train_df(test, scaler)

    # Save
    save_split_df(train_scaled, 'train_scaled', out_dir)
    save_split_df(valid_scaled, 'valid_scaled', out_dir)
    save_split_df(test_scaled, 'test_scaled', out_dir)
    save_split_df(scaler, 'scaler', out_dir)
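# Example invocation (a sketch; the file name scale_data.py is assumed, it is not given in this listing):
#     python3 scale_data.py --input_dir ~/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020/train-val-test \
#                           --output_dir_base ~/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020
# This writes train_scaled.pkl, valid_scaled.pkl, test_scaled.pkl and scaler.pkl into .../train-val-test-normalized.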
"""
A script to prepare full/train/valid/test pickle files with Pandas dataframe containing simulation data for car traversing a road with
crack/patch/pothole defects, with various car speeds and defect geometries.
@author: Milena Bajic (DTU Compute)
"""
import sys,os, glob
import pickle
from scipy.io import loadmat
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import argparse
def save_split_df(df, df_type, out_dir):
    pickle_name = out_dir + '/' + df_type + '.pkl'
    df.to_pickle(pickle_name)
    print('Wrote output file to: ', pickle_name)
    return
# ============================= #
# ============================= #
if __name__ == "__main__":
    home = os.path.expanduser('~')
    parser = argparse.ArgumentParser(description='Please provide command line arguments.')
    parser.add_argument('--input_dir', default = '{0}/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020/Matlab-Files'.format(home),
                        help = 'Input directory containing single-defect .mat files.')
    parser.add_argument('--output_dir_base', default = '{0}/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020'.format(home),
                        help = 'Directory base where a new directory with output files will be created.')
    args = parser.parse_args()
    input_dir = args.input_dir
    output_dir_base = args.output_dir_base

    # Make output directory
    out_dir = '{0}/train-val-test'.format(output_dir_base)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # === CONVERT MAT FILES TO ONE PANDAS DATAFRAME AND SAVE === #
    # ========================================================== #
    # Load mat files
    file_pattern = '{0}/*.mat'.format(input_dir)
    df = pd.DataFrame(columns = ['severity', 'type', 'time', 'distance', 'acceleration'])  # other columns are added per file
    n_files = len(glob.glob(file_pattern))  # number of .mat files to process
    itr = 0
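    # The parsing below assumes .mat file names of the form (hypothetical example, reconstructed
    # from the split() calls rather than taken from the actual data directory):
    #     qcar_AccZ_pothole_width250_mm_depth-100_mm_speed60_kmh_rate300_Hz.mat
    # i.e. defect label after 'qcar_AccZ_', then width/depth in mm, speed in km/h and sampling rate in Hz.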
    for filename in glob.glob(file_pattern):
        f = loadmat(filename)  # loaded as a python dictionary
        if itr % 1000 == 0:
            print('N processed files: {0}, currently loaded: {1}'.format(itr, filename))
        remove_keys = ['__header__', '__version__', '__globals__']
        for key in remove_keys:
            del f[key]
        for key in f:
            f[key] = [ f[key].reshape(-1) ]

        # Dataframe for this file
        df_tmp = pd.DataFrame.from_dict(f)

        # Add info parsed from the file name to the dataframe
        file_info = filename.split('/')[-1].split('.mat')[0]
        df_tmp['defect_label'] = file_info.split('qcar_AccZ_')[1].split('_')[0]
        df_tmp['defect_width'] = int(file_info.split('width')[1].split('_mm')[0])
        df_tmp['defect_height'] = int(file_info.split('depth')[1].split('_mm')[0])
        df_tmp['speed'] = int(file_info.split('speed')[1].split('_kmh')[0])
        df_tmp['sampling_freq'] = int(file_info.split('rate')[1].split('_Hz')[0])

        # Save the full filename too
        df_tmp['filename'] = filename.split('/')[-1]

        # Append the dataframe for this file to the combined one
        df = df.append(df_tmp)  # each file's data is now a row
        itr = itr + 1

    # Reset index of the final dataframe
    df.reset_index(inplace=True, drop=True)

    # Dump dataframe to a pkl file
    pickle_name = '{0}/simulation_full.pkl'.format(out_dir)
    with open(pickle_name, 'wb') as outfile:
        pickle.dump(df, outfile, pickle.HIGHEST_PROTOCOL)
    print('Wrote output file to: ', pickle_name)
    # === SPLIT DATAFRAME INTO TRAIN/VALID/TEST AND SAVE === #
    # ====================================================== #
    test_size = 0.2
    val_size = 0.2
    trainval, test = train_test_split(df, test_size=test_size, random_state=11, shuffle=True)
    train, val = train_test_split(trainval, test_size=val_size/(1-test_size), random_state=11)
    train.reset_index(inplace=True, drop=True)
    val.reset_index(inplace=True, drop=True)
    test.reset_index(inplace=True, drop=True)
    save_split_df(train, 'train', out_dir)
    save_split_df(val, 'valid', out_dir)
    save_split_df(test, 'test', out_dir)
import sys,os, glob, time
import pickle
import shutil
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import Pool, cpu_count
# save def. width, height, severity and speed too
class Window_dataset():
    def __init__(self, input_dir, filestring, win_size = 2, out_dir = '', is_test = True):
        t0 = time.time()

        # Get from input
        self.input_dir = input_dir
        self.out_dir = out_dir
        self.filestring = filestring
        self.win_size = win_size
        self.test = is_test
        self.n_processes = 1  # cpu_count()

        # Load pickle
        self.input_dataframe = self.load_pickle(input_dir, filestring)

        # Remove rows with 0 points recorded; n_points = fs * defect_width / (v/3.6) = 3.6 * fs * defect_width / v[km/h]
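        # Worked example (illustrative numbers, not taken from the data): with fs = 300 Hz, a 0.05 m
        # wide defect and v = 36 km/h, n_points = 3.6 * 300 * 0.05 / 36 = 1.5, so one or two samples
        # fall on the defect; at 120 km/h the same defect gives 0.45, i.e. often no sample at all,
        # and such rows are dropped below.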
        if self.test:
            self.input_dataframe = self.remove_samples_with_zero_counts(self.input_dataframe).head(100)
            self.n_split_rows_length = 20
        else:
            self.input_dataframe = self.remove_samples_with_zero_counts(self.input_dataframe)
            self.n_split_rows_length = 1000

        # Take only the needed columns
        self.input_columns = ['time', 'distance', 'speed', 'acceleration', 'severity', 'type', 'defect_width', 'defect_height']
        self.deciding_column = 'type'
        self.input_dataframe = self.input_dataframe[self.input_columns]

        # Window columns to save
        self.window_columns = [col for col in self.input_columns if col != 'distance']
        self.window_columns.append('window_class')

        # Split the input df into smaller ones (to allow running in parallel on the split dataframes)
        print('Making split dataframes')
        self.n_input_rows = self.input_dataframe.shape[0]
        self.last_split = int(self.n_input_rows/self.n_split_rows_length)
        self.index_list = [n*self.n_split_rows_length for n in range(1, self.last_split+1)]
        self.split_input_dataframes = np.split(self.input_dataframe, self.index_list)
        self.n_splits = len(self.split_input_dataframes)
        print('Number of split dataframes: {0}'.format(self.n_splits))
        print('Number of processes: {0}'.format(self.n_processes))

        # Prepare chunks
        #print('Preparing chunks')
        #self.chunks = [ self.split_input_dataframes[x:x+self.n_processes]
        #                for x in range(0, len(self.split_input_dataframes), self.n_processes)]
        #print('Number of chunks: {0}'.format( len(self.chunks)))

        for df_i, df in list(enumerate(self.split_input_dataframes)):
            print('===> Passing df: ', df_i)
            df.reset_index(inplace=True, drop=True)
            self.make_sliding_window_df(df_i, df)

        # Disabled multiprocessing variant kept for reference:
        '''
        pool = Pool(processes=self.n_processes)
        pass_chunk = list(enumerate(chunk))
        pass_chunk = [(chunk_number,) + c for c in pass_chunk] # [ (chunk_number, df_number, chunk_data) ]
        print('Chunk data prepared, starting pool map..')
        pool.map(self.make_sliding_window_df, pass_chunk)
        pool.close()
        pool.join()
        '''
        #sys.exit(0)

        dt = round(time.time()-t0, 1)
        print('Time to process: {0} s'.format(dt))
    def load_pickle(self, input_dir, string):
        filename = '{0}/{1}_scaled.pkl'.format(input_dir, string)
        print('Loading: {0}'.format(filename))
        with open(filename, "rb") as f:
            df = pickle.load(f)
        return df
    def remove_samples_with_zero_counts(self, input_dataframe):
        # Remove samples whose defect is so narrow that no point was 'caught' in the type and severity arrays
        input_dataframe['keep'] = input_dataframe.type.apply(lambda row: np.count_nonzero(row) > 0)
        input_dataframe = input_dataframe[ input_dataframe['keep'] == True ]
        input_dataframe.drop(['keep'], axis=1, inplace = True)
        input_dataframe.reset_index(drop=True, inplace=True)
        return input_dataframe
    def make_sliding_window_df(self, df_i, input_dataframe_part):
        print('Making sliding window')
        #try: # parallel mode, pass data in chunks
        #    chunk_i, df_i, input_dataframe_part = inp
        #    print('Chunk_i: ', chunk_i)
        #    print('df_i: ', df_i)
        window_df = pd.DataFrame([], columns = self.window_columns)
        #print('Data: ', type(input_dataframe_part))

        # Fill pd with windows from initial one
        for index, row in input_dataframe_part.iterrows():
            if (index % 500 == 0):
                print('Processing row: {0}/{1}'.format(index, input_dataframe_part.shape[0]))
            row_df = self.make_sliding_window_row(row)
            window_df = window_df.append(row_df)
        window_df.reset_index(inplace=True, drop=True)

        # Save pickle
        self.save_pickle(window_df, self.out_dir, self.filestring + '_' + str(df_i))
        return
    def make_sliding_window_row(self, row):
        row_df = pd.DataFrame([], columns = self.window_columns)
        # End index in the whole row, chosen so that the last window still spans win_size (2 m)
        end_index = np.where(row.distance > 100 - self.win_size)[0][0] - 1
        #print(end_index, row.distance[end_index])

        # Loop over windows
        for i in range(0, end_index + 1):  # include the last window
            try:
                # Get min and max index of this window
                window_start_meters = row.distance[i]
                window_end_meters = window_start_meters + self.win_size
                window_end_index = np.where(row.distance > window_end_meters)[0][0]
                #print(i, window_end_index, window_start_meters, window_end_meters)

                # If the window is fully flat, keep it only with a small probability,
                # roughly matching how rare each defect is
                window_is_flat = np.all(row[self.deciding_column][i: window_end_index] == 0)
                remove_window = False
                if window_is_flat:
                    remove_window = random.randrange(100) < 99  # keep flat windows with ~1% probability
                if remove_window:
                    continue

                # Put this window into the row df data
                for col in self.window_columns:
                    #print('Col: ', col)
                    if col == 'window_class':  # compute the window class column
                        # Only 1-label windows are possible, or windows with 0 (no defect) plus 1 defect
                        unique_classes = np.unique(row['type'][i: window_end_index])
                        #print('uniq: ', unique_classes)
                        if len(unique_classes) == 1:
                            row_df.at[i, col] = unique_classes[0]
                        elif len(unique_classes) == 2:
                            row_df.at[i, col] = list(filter(lambda c: c != 0, unique_classes))[0]
                        else:
                            raise ValueError("More than 1 defect per window not implemented.")
                    elif isinstance(row[col], np.ndarray):  # fill numpy array columns
                        row_df.at[i, col] = row[col][i: window_end_index]
                    else:
                        row_df.at[i, col] = row[col]  # float or string, just repeat
                #sys.exit(0)
            except:
                # Skip windows that run past the last recorded sample (IndexError from np.where)
                # or that contain more than one defect type (ValueError above).
                pass
        #print(row_df.window_class)
        return row_df
    def save_pickle(self, df, out_dir, df_type):
        print('Saving {0} as pickle.'.format(df_type))
        pickle_name = out_dir + '/' + df_type + '_windows.pkl'
        df.to_pickle(pickle_name)
        print('Wrote output file to: ', pickle_name)
        return
# =========SETUP ============== #
# ============================= #
cluster = True
is_test = False
filetype = 'train'
#===============================#
# ============================= #
# Input dir
# =======#
if cluster:
    input_dir = '/dtu-compute/mibaj/Golden-car-simulation-August-2020/train-val-test-normalized'
else:
    input_dir = '/Users/mibaj/quarter-car-simulation-data-analysis/data/Golden-car-simulation-August-2020/train-val-test-normalized'

# Output directory
# =======#
out_dir = '{0}-split-into-windows'.format(input_dir)
if not os.path.exists(out_dir):
    os.makedirs(out_dir)
if not os.path.exists(out_dir + '/' + str(filetype)):
    os.makedirs(out_dir + '/' + str(filetype))

# Test
# ======#
if is_test:
    test = Window_dataset(input_dir, 'test', out_dir = out_dir, is_test = is_test)
    sys.exit(0)

# Datasets
# ====================== #
result = Window_dataset(input_dir, filetype, out_dir = out_dir + '/' + str(filetype), is_test = is_test)
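# Possible follow-up (a sketch, not executed by this script): the per-chunk window pickles can be
# recombined into a single dataframe for analysis, e.g.
#     import glob, pandas as pd
#     files = sorted(glob.glob(out_dir + '/train/train_*_windows.pkl'))
#     windows = pd.concat([pd.read_pickle(f) for f in files], ignore_index=True)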