Commit a37c48f9 authored by mibaj

Update transform_to_window_dataset.py

parent d9be7324
"""
A script to apply the sliding window approach on input time series data. Creates fully prepared datasets for analysis.
@author: Milena Bajic (DTU Compute)
"""
import sys, os, glob, time
import argparse
import pickle
import shutil
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import Pool, cpu_count
# TODO: save defect width, height, severity and speed too
class Window_dataset():

    def __init__(self, input_dir, filestring, win_size = 2, out_dir = '', is_test = True):
        # Initial processing time
        t0 = time.time()

        # Get from input
@@ -55,28 +62,10 @@ class Window_dataset():
        print('Number of split dataframes: {0}'.format(self.n_splits))
        print('Number of processes: {0}'.format(self.n_processes))
        # Process each split dataframe sequentially
        for df_i, df in enumerate(self.split_input_dataframes):
            print('===> Passing df: ', df_i)
            df.reset_index(inplace=True, drop=True)
            self.make_sliding_window_df(df_i, df)

        # Disabled parallel mode: prepare chunks of the split dataframes and
        # map them over a process pool instead of the sequential loop above.
        # (In this mode make_sliding_window_df would need to unpack a single
        # tuple argument instead of taking two parameters.)
        '''
        chunks = [self.split_input_dataframes[x: x + self.n_processes]
                  for x in range(0, len(self.split_input_dataframes), self.n_processes)]
        print('Number of chunks: {0}'.format(len(chunks)))
        pool = Pool(processes=self.n_processes)
        for chunk_number, chunk in enumerate(chunks):
            # [(chunk_number, df_number, chunk_data)]
            pass_chunk = [(chunk_number,) + c for c in enumerate(chunk)]
            print('Chunk data prepared, starting pool map..')
            pool.map(self.make_sliding_window_df, pass_chunk)
        pool.close()
        pool.join()
        '''

        dt = round(time.time() - t0, 1)
        print('Time to process: {0} s'.format(dt))
@@ -86,31 +75,23 @@ class Window_dataset():
        print('Loading: {0}'.format(filename))
        with open(filename, "rb") as f:
            df = pickle.load(f)
        return df
    def remove_samples_with_zero_counts(self, input_dataframe):
        # Remove samples whose defect is too narrow to have any point "caught"
        # in the type and severity columns
        input_dataframe['keep'] = input_dataframe.type.apply(lambda arr: np.count_nonzero(arr) > 0)
        input_dataframe = input_dataframe[input_dataframe['keep']]
        input_dataframe.drop(['keep'], axis=1, inplace=True)
        input_dataframe.reset_index(drop=True, inplace=True)
        return input_dataframe
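
    # Example of the filter above (hypothetical type values):
    #   type == [0, 0, 0]  ->  np.count_nonzero == 0  ->  row dropped
    #   type == [0, 2, 2]  ->  np.count_nonzero == 2  ->  row kept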
    def make_sliding_window_df(self, df_i, input_dataframe_part):
        # Make the sliding windows: each window is constant in distance,
        # variable in length (number of samples) and slides by one point
        print('Making sliding window')
        window_df = pd.DataFrame([], columns=self.window_columns)

        # Fill the dataframe with windows taken from the initial one
        for index, row in input_dataframe_part.iterrows():
            if index % 500 == 0:
                print('Processing row: {0}/{1}'.format(index, input_dataframe_part.shape[0]))
@@ -129,8 +110,8 @@ class Window_dataset():
            # End index in the whole row, so that the last window still spans win_size meters
            end_index = np.where(row.distance > 100 - self.win_size)[0][0] - 1

            # Loop over the windows, including the last one
            for i in range(0, end_index + 1):
                try:
                    # Get min and max index of this window
                    window_start_meters = row.distance[i]
@@ -148,10 +129,8 @@ class Window_dataset():
                    # Put this window into the row dataframe
                    for col in self.window_columns:
                        if col == 'window_class':  # compute window class column
                            # Only single-label windows, or windows with 0 (no defect)
                            # plus one defect label, are possible
                            unique_classes = np.unique(row['type'][i: window_end_index])
                            if len(unique_classes) == 1:
                                row_df.at[i, col] = unique_classes[0]
                            elif len(unique_classes) == 2:
@@ -162,14 +141,12 @@ class Window_dataset():
                            row_df.at[i, col] = row[col][i: window_end_index]
                        else:
                            row_df.at[i, col] = row[col]  # float or string, just repeat
                except:
                    # Window runs past the end of the row: skip it
                    pass

        return row_df
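
    # window_class illustration (hypothetical values): a window whose type
    # slice is [0, 0, 3, 3] yields np.unique == [0, 3]; per the comment above
    # only one defect label can occur per window, so the two-class branch
    # (elided here) presumably assigns the nonzero label 3 as the window class.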
    def save_pickle(self, df, out_dir, df_type):
        print('Saving {0} as pickle.'.format(df_type))
        pickle_name = out_dir + '/' + df_type + '_windows.pkl'
        df.to_pickle(pickle_name)
@@ -177,39 +154,38 @@ class Window_dataset():
        return
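
# A minimal, self-contained sketch (not part of the original script) of the
# windowing idea in make_sliding_window_df: windows with a fixed length in
# meters slide along the distance axis one sample at a time. The values and
# the helper name are hypothetical, and the exact index formulas of the class
# live partly in elided sections above.
def _sliding_window_demo():
    distance = np.linspace(0, 100, 11)   # toy distance axis: 0, 10, ..., 100 m
    win_size = 20                        # hypothetical window length in meters
    # Index of the last window start for which the window still fits below 100 m
    end_index = np.where(distance > 100 - win_size)[0][0] - 1
    for i in range(0, end_index + 1):
        window_start_meters = distance[i]
        # One past the last sample that is still inside the window
        window_end_index = np.where(distance < window_start_meters + win_size)[0][-1] + 1
        print(i, distance[i: window_end_index])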
# ============================= #
# ========== SETUP ============ #
# ============================= #
if __name__ == "__main__":

    home = os.path.expanduser('~')

    parser = argparse.ArgumentParser(description='Please provide command line arguments.')
    parser.add_argument('--test', action='store_true',
                        help='If set, process only 100 rows (use for testing purposes).')
    parser.add_argument('--filetype', default='train',
                        help='Choose between train, test or valid. This file will be processed and the output created.')
    parser.add_argument('--input_dir', default='{0}/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020/train-val-test-normalized'.format(home),
                        help='Input directory containing single-defect .mat files.')
    parser.add_argument('--output_dir_base', default='{0}/quarter_car_model_data_preparation/data/Golden-car-simulation-August-2020'.format(home),
                        help='Directory base where a new directory with the output files will be created.')
    # ============================= #
    args = parser.parse_args()

    # Input
    # =====#
    input_dir = args.input_dir
    output_dir_base = args.output_dir_base
    is_test = args.test
    filetype = args.filetype
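
    # Example invocation (paths hypothetical, a usage sketch only):
    #   python transform_to_window_dataset.py --filetype valid \
    #       --input_dir ~/sim-data/train-val-test-normalized \
    #       --output_dir_base ~/sim-data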
# Output directory
# =======#
out_dir = '{0}-split-into-windows'.format(input_dir)
# Make output directory
out_dir = '{0}/train-val-test-normalized-split-into-windows'.format(output_dir_base)
if not os.path.exists(out_dir):
os.makedirs(out_dir)
if not os.path.exists(out_dir + '/'+str(filetype)):
os.makedirs(out_dir + '/'+str(filetype))
    # Process
    # ======#
    if is_test:
        result = Window_dataset(input_dir, 'test', out_dir=out_dir, is_test=is_test)
    else:
        # Pass is_test explicitly: the constructor default is True, which would
        # otherwise truncate a full run to 100 rows
        result = Window_dataset(input_dir, filetype, out_dir=out_dir + '/' + str(filetype), is_test=is_test)
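
    # Reading the produced windows back (a sketch only; it assumes save_pickle
    # is called with df_type == filetype, which happens in an elided section):
    #   windows = pd.read_pickle('{0}/{1}/{1}_windows.pkl'.format(out_dir, filetype))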