Commit 4b45627b authored by milenabaj

data preparation module

README 0 → 100755
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 31 14:09:35 2020
@author: milena
"""
Simulation of the acceleration signal produced when a car traverses a road with cracks, potholes and patches.
Car parameters: Golden car model
Road length: 100 m
Defect geometries:
patch:
dmin = 0.005; dmax = 0.05; dincr = 0.005; % minimum, maximum and increment of severity
wmin = 0.250; wmax = 5.00; wincr = 0.0500; % minimum, maximum and increment of length
pothole:
dmin = -0.010; dmax = -0.25; dincr = -0.005; % minimum, maximum and increment of severity
wmin = 0.050; wmax = 0.50; wincr = 0.0250; % minimum, maximum and increment of length
crack:
dmin = -0.020; dmax = -0.30; dincr = -0.005; % minimum, maximum and increment of severity
wmin = 0.004; wmax = 0.04; wincr = 0.0020; % minimum, maximum and increment of length
Speed: 0-120 km/h in steps of 2 km/h
Sampling frequency: around 300 Hz (can differ by a few Hz between examples)
Cases where the defect is too narrow to be recorded at this sampling frequency are removed during preprocessing.
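As a rough illustration of the grid size (assuming inclusive endpoints): the patch specification alone gives (0.05-0.005)/0.005 + 1 = 10 severities and (5.00-0.25)/0.05 + 1 = 96 lengths, i.e. about 960 geometry combinations per simulated speed.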
Directory contents:
- Matlab-files: output of the simulation module - each .mat file corresponds to one defect type, geometry and speed.
- train-val-test: all Matlab files are converted to numpy arrays, floats and strings and organized into a pandas DataFrame,
which is saved as a pickle file ('simulation_full.pkl'). Each row in the DataFrame corresponds to one Matlab file. The full file is
split into 60%/20%/20% train/valid/test files. Defects too narrow to contain any sampled point are removed.
- train-val-test-normalized: the train file from train-val-test is scaled to the 0-1 range. The scaler is saved and applied to
the valid and test files.
- train-val-test-normalized-split-into-windows: a sliding window of size = 2 m and step = 1 point is applied to the train/valid/test files from train-val-test-normalized.
The results are saved in this directory; these are the final inputs for analysis.
- scaler.pkl: train scaler file.
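A minimal loading sketch (the directory layout above is assumed; 'train_0_windows.pkl' is just an example chunk name produced by the windowing step):

    import pandas as pd
    base = 'Golden-car-simulation-August-2020'
    scaler = pd.read_pickle(base + '/train-val-test-normalized/scaler.pkl')   # rows 0/1/2 = min/max/(max-min) per scaled column
    windows = pd.read_pickle(base + '/train-val-test-normalized-split-into-windows/train/train_0_windows.pkl')
    print(windows['window_class'].value_counts())                             # defect class per 2 m window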
"""
A scaling script to be run on prepared train/val/test data.
@author: Milena Bajic (DTU Compute)
"""
import sys,os, glob
import argparse
import pickle
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def save_split_df(df, df_type, out_dir):
    print('Saving {0} as pickle.'.format(df_type))
    pickle_name = out_dir + '/' + df_type + '.pkl'
    df.to_pickle(pickle_name)
    print('Wrote output file to: ', pickle_name)
    return df
def load_pickle(input_dir, string):
    filename = '{0}/{1}.pkl'.format(input_dir, string)
    print('Loading: {0}'.format(filename))
    with open(filename, "rb") as f:
        df = pickle.load(f)
    return df
def scale_train_df(train_df, cols_to_scale = [' Hastighed [m/s]', ' Acceleration [m/s²]']):
    # Default column names are Danish: 'Hastighed' = speed, 'Acceleration' = acceleration.
    print('Scaling train data')
    # Get min and max over the full dataset
    train_df_scaled = pd.DataFrame([], columns = train_df.columns)
    train_df_maxmin = pd.DataFrame([], columns = train_df.columns)
    for col in train_df.columns:
        if col in cols_to_scale:
            # Get parameters (cells are numpy arrays, so take min/max per row first)
            col_min = min(train_df[col].apply(lambda row: row.min()))
            col_max = max(train_df[col].apply(lambda row: row.max()))
            col_diff = col_max - col_min
            # Scale
            train_df_scaled[col] = train_df[col].apply(lambda row: (row - col_min)/col_diff)
            # Save scaler info
            train_df_maxmin.at[0, col] = col_min
            train_df_maxmin.at[1, col] = col_max
            train_df_maxmin.at[2, col] = col_diff  # difference
        else:
            train_df_scaled[col] = train_df[col]
    return train_df_scaled, train_df_maxmin
def scale_non_train_df(df, train_df_maxmin):
    print('Scaling valid/test data')
    df_scaled = pd.DataFrame([], columns = df.columns)
    for col in df.columns:
        # Get params saved from the train split
        col_min = train_df_maxmin.at[0, col]
        col_max = train_df_maxmin.at[1, col]
        col_diff = train_df_maxmin.at[2, col]  # difference
        # Scale only columns that have scaler entries (unscaled columns were left as NaN)
        if pd.isnull(col_min):
            df_scaled[col] = df[col]
        else:
            df_scaled[col] = df[col].apply(lambda row: (row - col_min)/col_diff)
    return df_scaled
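# Illustration (not executed): the "scaler" returned by scale_train_df is itself a DataFrame
# whose row 0 holds the per-column minimum, row 1 the maximum and row 2 the difference, e.g.
#     scaler.at[0, 'acceleration']  -> min acceleration seen in the train split
#     scaler.at[2, 'acceleration']  -> max - min, the divisor used for valid/test scaling
# Columns that were not scaled stay NaN in this DataFrame, which is how scale_non_train_df
# knows to leave them untouched.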
# ============================= #
# ============================= #
if __name__ == "__main__":
    home = os.path.expanduser('~')
    parser = argparse.ArgumentParser(description='Please provide command line arguments.')
    parser.add_argument('--input_dir', default = '{0}/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020/train-val-test'.format(home),
                        help = 'Input directory containing the train/valid/test pickle files.')
    parser.add_argument('--output_dir_base', default = '{0}/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020'.format(home),
                        help = 'Directory base where a new directory with output files will be created.')
    args = parser.parse_args()
    input_dir = args.input_dir
    output_dir_base = args.output_dir_base

    # Make output directory
    out_dir = '{0}/train-val-test-normalized'.format(output_dir_base)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Load files
    train = load_pickle(input_dir, 'train')
    valid = load_pickle(input_dir, 'valid')
    test = load_pickle(input_dir, 'test')

    # Scale
    train_scaled, scaler = scale_train_df(train, cols_to_scale = ['acceleration'])
    valid_scaled = scale_non_train_df(valid, scaler)
    test_scaled = scale_non_train_df(test, scaler)

    # Save
    save_split_df(train_scaled, 'train_scaled', out_dir)
    save_split_df(valid_scaled, 'valid_scaled', out_dir)
    save_split_df(test_scaled, 'test_scaled', out_dir)
    save_split_df(scaler, 'scaler', out_dir)
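# Example invocation (a sketch; the file name scale_data.py is assumed, it is not given in this listing):
#     python3 scale_data.py --input_dir ~/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020/train-val-test \
#                           --output_dir_base ~/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020
# This writes train_scaled.pkl, valid_scaled.pkl, test_scaled.pkl and scaler.pkl into .../train-val-test-normalized.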
"""
A script to prepare full/train/valid/test pickle files with Pandas dataframe containing simulation data for car traversing a road with
crack/patch/pothole defects, with various car speeds and defect geometries.
@author: Milena Bajic (DTU Compute)
"""
import sys,os, glob
import pickle
from scipy.io import loadmat
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import argparse
def save_split_df(df, df_type, out_dir):
    pickle_name = out_dir + '/' + df_type + '.pkl'
    df.to_pickle(pickle_name)
    print('Wrote output file to: ', pickle_name)
    return
# ============================= #
# ============================= #
if __name__ == "__main__":
    home = os.path.expanduser('~')
    parser = argparse.ArgumentParser(description='Please provide command line arguments.')
    parser.add_argument('--input_dir', default = '{0}/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020/Matlab-Files'.format(home),
                        help = 'Input directory containing single-defect .mat files.')
    parser.add_argument('--output_dir_base', default = '{0}/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020'.format(home),
                        help = 'Directory base where a new directory with output files will be created.')
    args = parser.parse_args()
    input_dir = args.input_dir
    output_dir_base = args.output_dir_base

    # Make output directory
    out_dir = '{0}/train-val-test'.format(output_dir_base)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # === CONVERT MAT FILES TO ONE PANDAS DATAFRAME AND SAVE === #
    # ========================================================== #
    # Load mat files
    file_pattern = '{0}/*.mat'.format(input_dir)
    df = pd.DataFrame(columns = ['severity', 'type', 'time', 'distance', 'acceleration'])  # other columns are added per file
    n_files = len(glob.glob(file_pattern))  # number of .mat files to process
    itr = 0
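    # The parsing below assumes .mat file names of the form (hypothetical example, reconstructed
    # from the split() calls rather than taken from the actual data directory):
    #     qcar_AccZ_pothole_width250_mm_depth-100_mm_speed60_kmh_rate300_Hz.mat
    # i.e. defect label after 'qcar_AccZ_', then width/depth in mm, speed in km/h and sampling rate in Hz.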
    for filename in glob.glob(file_pattern):
        f = loadmat(filename)  # loaded as a python dictionary
        if itr % 1000 == 0:
            print('N processed files: {0}, currently loaded: {1}'.format(itr, filename))
        remove_keys = ['__header__', '__version__', '__globals__']
        for key in remove_keys:
            del f[key]
        for key in f:
            f[key] = [ f[key].reshape(-1) ]

        # Dataframe for this file
        df_tmp = pd.DataFrame.from_dict(f)

        # Add info parsed from the file name to the dataframe
        file_info = filename.split('/')[-1].split('.mat')[0]
        df_tmp['defect_label'] = file_info.split('qcar_AccZ_')[1].split('_')[0]
        df_tmp['defect_width'] = int(file_info.split('width')[1].split('_mm')[0])
        df_tmp['defect_height'] = int(file_info.split('depth')[1].split('_mm')[0])
        df_tmp['speed'] = int(file_info.split('speed')[1].split('_kmh')[0])
        df_tmp['sampling_freq'] = int(file_info.split('rate')[1].split('_Hz')[0])

        # Save the full filename too
        df_tmp['filename'] = filename.split('/')[-1]

        # Append the dataframe for this file to the combined one
        df = df.append(df_tmp)  # each file's data is now a row
        itr = itr + 1

    # Reset index of the final dataframe
    df.reset_index(inplace=True, drop=True)

    # Dump dataframe to a pkl file
    pickle_name = '{0}/simulation_full.pkl'.format(out_dir)
    with open(pickle_name, 'wb') as outfile:
        pickle.dump(df, outfile, pickle.HIGHEST_PROTOCOL)
    print('Wrote output file to: ', pickle_name)
    # === SPLIT DATAFRAME INTO TRAIN/VALID/TEST AND SAVE === #
    # ====================================================== #
    test_size = 0.2
    val_size = 0.2
    trainval, test = train_test_split(df, test_size=test_size, random_state=11, shuffle=True)
    train, val = train_test_split(trainval, test_size=val_size/(1-test_size), random_state=11)
    train.reset_index(inplace=True, drop=True)
    val.reset_index(inplace=True, drop=True)
    test.reset_index(inplace=True, drop=True)
    save_split_df(train, 'train', out_dir)
    save_split_df(val, 'valid', out_dir)
    save_split_df(test, 'test', out_dir)
import sys,os, glob, time
import pickle
import shutil
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import Pool, cpu_count
# save def. width, height, severity and speed too
class Window_dataset():
    def __init__(self, input_dir, filestring, win_size = 2, out_dir = '', is_test = True):
        t0 = time.time()

        # Get from input
        self.input_dir = input_dir
        self.out_dir = out_dir
        self.filestring = filestring
        self.win_size = win_size
        self.test = is_test
        self.n_processes = 1  # cpu_count()

        # Load pickle
        self.input_dataframe = self.load_pickle(input_dir, filestring)

        # Remove rows with 0 points recorded; n_points = fs * defect_width / (v/3.6) = 3.6 * fs * defect_width / v[km/h]
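        # Worked example (illustrative numbers, not taken from the data): with fs = 300 Hz, a 0.05 m
        # wide defect and v = 36 km/h, n_points = 3.6 * 300 * 0.05 / 36 = 1.5, so one or two samples
        # fall on the defect; at 120 km/h the same defect gives 0.45, i.e. often no sample at all,
        # and such rows are dropped below.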
        if self.test:
            self.input_dataframe = self.remove_samples_with_zero_counts(self.input_dataframe).head(100)
            self.n_split_rows_length = 20
        else:
            self.input_dataframe = self.remove_samples_with_zero_counts(self.input_dataframe)
            self.n_split_rows_length = 1000

        # Take only the needed columns
        self.input_columns = ['time', 'distance', 'speed', 'acceleration', 'severity', 'type', 'defect_width', 'defect_height']
        self.deciding_column = 'type'
        self.input_dataframe = self.input_dataframe[self.input_columns]

        # Window columns to save
        self.window_columns = [col for col in self.input_columns if col != 'distance']
        self.window_columns.append('window_class')

        # Split the input df into smaller ones (to allow running in parallel on the split dataframes)
        print('Making split dataframes')
        self.n_input_rows = self.input_dataframe.shape[0]
        self.last_split = int(self.n_input_rows/self.n_split_rows_length)
        self.index_list = [n*self.n_split_rows_length for n in range(1, self.last_split+1)]
        self.split_input_dataframes = np.split(self.input_dataframe, self.index_list)
        self.n_splits = len(self.split_input_dataframes)
        print('Number of split dataframes: {0}'.format(self.n_splits))
        print('Number of processes: {0}'.format(self.n_processes))

        # Prepare chunks
        #print('Preparing chunks')
        #self.chunks = [ self.split_input_dataframes[x:x+self.n_processes]
        #                for x in range(0, len(self.split_input_dataframes), self.n_processes)]
        #print('Number of chunks: {0}'.format( len(self.chunks)))

        for df_i, df in list(enumerate(self.split_input_dataframes)):
            print('===> Passing df: ', df_i)
            df.reset_index(inplace=True, drop=True)
            self.make_sliding_window_df(df_i, df)

        # Disabled multiprocessing variant kept for reference:
        '''
        pool = Pool(processes=self.n_processes)
        pass_chunk = list(enumerate(chunk))
        pass_chunk = [(chunk_number,) + c for c in pass_chunk] # [ (chunk_number, df_number, chunk_data) ]
        print('Chunk data prepared, starting pool map..')
        pool.map(self.make_sliding_window_df, pass_chunk)
        pool.close()
        pool.join()
        '''
        #sys.exit(0)

        dt = round(time.time()-t0, 1)
        print('Time to process: {0} s'.format(dt))
    def load_pickle(self, input_dir, string):
        filename = '{0}/{1}_scaled.pkl'.format(input_dir, string)
        print('Loading: {0}'.format(filename))
        with open(filename, "rb") as f:
            df = pickle.load(f)
        return df
    def remove_samples_with_zero_counts(self, input_dataframe):
        # Remove samples whose defect is so narrow that no point was 'caught' in the type and severity arrays
        input_dataframe['keep'] = input_dataframe.type.apply(lambda row: np.count_nonzero(row) > 0)
        input_dataframe = input_dataframe[ input_dataframe['keep'] == True ]
        input_dataframe.drop(['keep'], axis=1, inplace = True)
        input_dataframe.reset_index(drop=True, inplace=True)
        return input_dataframe
    def make_sliding_window_df(self, df_i, input_dataframe_part):
        print('Making sliding window')
        #try: # parallel mode, pass data in chunks
        #    chunk_i, df_i, input_dataframe_part = inp
        #    print('Chunk_i: ', chunk_i)
        #    print('df_i: ', df_i)
        window_df = pd.DataFrame([], columns = self.window_columns)
        #print('Data: ', type(input_dataframe_part))

        # Fill pd with windows from initial one
        for index, row in input_dataframe_part.iterrows():
            if (index % 500 == 0):
                print('Processing row: {0}/{1}'.format(index, input_dataframe_part.shape[0]))
            row_df = self.make_sliding_window_row(row)
            window_df = window_df.append(row_df)
        window_df.reset_index(inplace=True, drop=True)

        # Save pickle
        self.save_pickle(window_df, self.out_dir, self.filestring + '_' + str(df_i))
        return
    def make_sliding_window_row(self, row):
        row_df = pd.DataFrame([], columns = self.window_columns)
        # End index in the whole row, chosen so that the last window still spans win_size (2 m)
        end_index = np.where(row.distance > 100 - self.win_size)[0][0] - 1
        #print(end_index, row.distance[end_index])

        # Loop over windows
        for i in range(0, end_index + 1):  # include the last window
            try:
                # Get min and max index of this window
                window_start_meters = row.distance[i]
                window_end_meters = window_start_meters + self.win_size
                window_end_index = np.where(row.distance > window_end_meters)[0][0]
                #print(i, window_end_index, window_start_meters, window_end_meters)

                # If the window is fully flat, keep it only with a small probability,
                # roughly matching how rare each defect is
                window_is_flat = np.all(row[self.deciding_column][i: window_end_index] == 0)
                remove_window = False
                if window_is_flat:
                    remove_window = random.randrange(100) < 99  # keep flat windows with ~1% probability
                if remove_window:
                    continue

                # Put this window into the row df data
                for col in self.window_columns:
                    #print('Col: ', col)
                    if col == 'window_class':  # compute the window class column
                        # Only 1-label windows are possible, or windows with 0 (no defect) plus 1 defect
                        unique_classes = np.unique(row['type'][i: window_end_index])
                        #print('uniq: ', unique_classes)
                        if len(unique_classes) == 1:
                            row_df.at[i, col] = unique_classes[0]
                        elif len(unique_classes) == 2:
                            row_df.at[i, col] = list(filter(lambda c: c != 0, unique_classes))[0]
                        else:
                            raise ValueError("More than 1 defect per window not implemented.")
                    elif isinstance(row[col], np.ndarray):  # fill numpy array columns
                        row_df.at[i, col] = row[col][i: window_end_index]
                    else:
                        row_df.at[i, col] = row[col]  # float or string, just repeat
                #sys.exit(0)
            except:
                # Skip windows that run past the last recorded sample (IndexError from np.where)
                # or that contain more than one defect type (ValueError above).
                pass
        #print(row_df.window_class)
        return row_df
    def save_pickle(self, df, out_dir, df_type):
        print('Saving {0} as pickle.'.format(df_type))
        pickle_name = out_dir + '/' + df_type + '_windows.pkl'
        df.to_pickle(pickle_name)
        print('Wrote output file to: ', pickle_name)
        return
# =========SETUP ============== #
# ============================= #
cluster = True
is_test = False
filetype = 'train'
#===============================#
# ============================= #
# Input dir
# =======#
if cluster:
    input_dir = '/dtu-compute/mibaj/Golden-car-simulation-August-2020/train-val-test-normalized'
else:
    input_dir = '/Users/mibaj/quarter-car-simulation-data-analysis/data/Golden-car-simulation-August-2020/train-val-test-normalized'

# Output directory
# =======#
out_dir = '{0}-split-into-windows'.format(input_dir)
if not os.path.exists(out_dir):
    os.makedirs(out_dir)
if not os.path.exists(out_dir + '/' + str(filetype)):
    os.makedirs(out_dir + '/' + str(filetype))

# Test
# ======#
if is_test:
    test = Window_dataset(input_dir, 'test', out_dir = out_dir, is_test = is_test)
    sys.exit(0)

# Datasets
# ====================== #
result = Window_dataset(input_dir, filetype, out_dir = out_dir + '/' + str(filetype), is_test = is_test)
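# Possible follow-up (a sketch, not executed by this script): the per-chunk window pickles can be
# recombined into a single dataframe for analysis, e.g.
#     import glob, pandas as pd
#     files = sorted(glob.glob(out_dir + '/train/train_*_windows.pkl'))
#     windows = pd.concat([pd.read_pickle(f) for f in files], ignore_index=True)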