#import the relevant libraries 
import os
import pymysql
import pandas as pd


# Establish the connection to the MySQL database.
#
# Every setting can be overridden through an environment variable so the
# notebook can be pointed at another server without editing the source; the
# fallbacks are the values that were previously hard-coded here.
# NOTE(review): a password committed to source is a credential leak -- set
# BACKBLAZE_DB_PASSWORD instead and remove the fallback once it is rotated.
host = os.environ.get("BACKBLAZE_DB_HOST", "192.168.88.187")
port = os.environ.get("BACKBLAZE_DB_PORT", "3306")
user = os.environ.get("BACKBLAZE_DB_USER", "backblaze")
password = os.environ.get("BACKBLAZE_DB_PASSWORD", "Testing.2023")
database = os.environ.get("BACKBLAZE_DB_NAME", "backblaze_ml")

conn = pymysql.connect(
    host=host,
    # `port` is kept as a string above, so convert it here; previously the
    # variable was ignored and a literal 3306 was passed instead.
    port=int(port),
    user=user,
    # `password` / `database` are the current keyword names; `passwd` / `db`
    # are only compatibility aliases.
    password=password,
    database=database,
    charset='utf8mb4',
)


# For this experiment the data for all drives (all models) is used: every
# row dated on or after 2014-03-01 whose serial number appears in at least
# one row with failure=1 in that same date range.
sqldf = pd.read_sql_query(
    sql=(
        "select * from drive_stats "
        "where date >= '2014-03-01' "
        "and serial_number in ("
        "select distinct(serial_number) from drive_stats "
        "where failure=1 and date >= '2014-03-01')"
    ),
    con=conn,
)
sqldf

/tmp/ipykernel_2251744/2026978849.py:2: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  sqldf = pd.read_sql_query("select * from drive_stats where date >= '2014-03-01' and serial_number in (select distinct(serial_number) from drive_stats where failure=1 and date >= '2014-03-01')", conn)


def preFailOn(x, n):
    """Label one observation by how close it is to the drive's failure.

    x is any object exposing a ``days_to_failure`` attribute (for example a
    DataFrame row); n is the number of days before the actual failure that
    still count as "pre-failure".

    Returns None when ``days_to_failure`` is exactly 0, 1 when it is at most
    n, and 0 otherwise.
    """
    remaining_days = x.days_to_failure
    if remaining_days == 0:
        return None
    return 1 if remaining_days <= n else 0


# Build the training frame and its 'prefailure' label.
#
# The label is computed column-wise rather than by calling a Python function
# once per row, which is far cheaper on a frame with millions of rows. The
# labelling rule is unchanged:
#   days_to_failure == 0                     -> row excluded (no label)
#   days_to_failure <= prefail_window_days   -> prefailure = 1.0
#   anything else                            -> prefailure = 0.0
prefail_window_days = 14

# Rows with days_to_failure == 0 get no label, so leave them out up front
# (this is what the former dropna on 'prefailure' achieved).
traindf = sqldf[sqldf['days_to_failure'] != 0].copy()
# Stored as float so the column keeps the dtype it had before.
traindf['prefailure'] = (traindf['days_to_failure'] <= prefail_window_days).astype(float)
traindf


import seaborn as sns
import pandas as pd
import numpy as np
np.random.seed(1337)
from IPython.display import Image
import matplotlib as mpl
import matplotlib.pyplot as plt
from pandas import read_csv
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler


# Display settings: print floats with three fixed decimals (avoids exponent
# notation in outputs and tables) and show at most 50 rows.
def _format_float(value):
    return '%.3f' % value


pd.set_option('display.float_format', _format_float)
pd.set_option('display.max_rows', 50)


# Training uses 'prefailure' as the target rather than 'failure', so
# 'failure' is removed together with the other non-feature columns.
non_feature_columns = [
    'days_to_failure',
    'capacity_bytes',
    'model',
    'serial_number',
    'date',
    'failure',
]
traindf = traindf.drop(columns=non_feature_columns)
# Replace the null values in the remaining columns with 0.
traindf = traindf.fillna(0)
traindf


# Summary statistics, one row per column.
traindf.describe().transpose()


# Labels of the columns whose dtype is still the generic `object`.
column_dtypes = traindf.dtypes
obj = column_dtypes[column_dtypes == object].index
obj

Index([], dtype='object')


# Split the dataset 70/30 into train/test, stratified on the label.
from sklearn.model_selection import train_test_split

# The last column is the label; every column before it is a feature.
feature_columns = traindf.columns[:-1]
label_columns = traindf.columns[-1:]

X_train, X_test, Y_train, Y_test = train_test_split(
    traindf[feature_columns],
    traindf[label_columns],
    test_size=0.30,
    stratify=traindf[label_columns],
)


X_train


Y_train


X_test


Y_test


import joblib

# Build the Random Forest classifier.
from sklearn.ensemble import RandomForestClassifier

# Create the model with scikit-learn's default hyper-parameters.
rfc = RandomForestClassifier()
# Y_train is a single-column frame; flatten it to the 1-D shape (n_samples,)
# that fit() expects, so scikit-learn no longer has to convert it itself and
# emit a DataConversionWarning.
rfc.fit(X_train, Y_train.to_numpy().ravel())

# Save the fitted model.
joblib.dump(rfc, "./dissertation-ml-experiment4-randomforestclassifier-predict-state-14days-before-failure-practical-allhddmodels.joblib")

/tmp/ipykernel_2251744/3765841894.py:8: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  rfc.fit(X_train, Y_train)

['./dissertation-ml-experiment4-randomforestclassifier-predict-state-14days-before-failure-practical-allhddmodels.joblib']


# Predictions (note the capital 'P' in yPred, used to differentiate between
# model 1 and model 2).
yPred = rfc.predict(X_test)


# Actual and predicted labels side by side, indexed like Y_test.
results = pd.DataFrame({'Actual': Y_test['prefailure'], 'Predicted': yPred})
results


yt, yp = results['Actual'], results['Predicted']


# Results of our predictions.

from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix

# Number of cases where the predicted and the actual label differ.
n_errors = (yp != yt).sum()

# Score the predictions (yp) against the actual labels (yt).
acc = accuracy_score(y_true=yt, y_pred=yp)
prec = precision_score(y_true=yt, y_pred=yp)
rec = recall_score(y_true=yt, y_pred=yp)
f1 = f1_score(y_true=yt, y_pred=yp)
MCC = matthews_corrcoef(y_true=yt, y_pred=yp)

# Report.
print("Model used is: Random Forest classifier")
print("The accuracy is {}".format(acc))
print("The precision is {}".format(prec))
print("The recall is {}".format(rec))
print("The F1-Score is {}".format(f1))
print("The Matthews correlation coefficient is {}".format(MCC))

Model used is: Random Forest classifier
The accuracy is 0.9861453280770638
The precision is 0.79528693247292
The recall is 0.19326789789831839
The F1-Score is 0.31096592509152354
The Matthews correlation coefficient is 0.3879839381031198


 # confusion matrix 

# Tick labels for the two classes, in the order used on both axes.
LABELS = ['Healthy', 'Failed']
conf_matrix = confusion_matrix(Y_test, yPred)

# Draw the matrix as an annotated heat map with integer cell counts.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    xticklabels=LABELS,
    yticklabels=LABELS,
    ax=ax,
)
ax.set_title("Confusion matrix")
ax.set_ylabel('True class')
ax.set_xlabel('Predicted class')
plt.show()

	date	serial_number	model	capacity_bytes	days_to_failure	failure	smart_5_raw	smart_187_raw	smart_188_raw	smart_189_raw	smart_196_raw	smart_197_raw
0	2014-03-01	MJ1311YNG36USA	Hitachi HDS5C3030ALA630	3000592982016	991	0	67.0	NaN	NaN	NaN	101.0	0.0
1	2014-03-01	MJ1311YNG733NA	Hitachi HDS5C3030ALA630	3000592982016	840	0	0.0	NaN	NaN	NaN	0.0	0.0
2	2014-03-01	W3009AX6	ST4000DM000	4000787030016	54	0	0.0	0.0	0.000000e+00	1.0	NaN	8.0
3	2014-03-01	WD-WCAV5M690585	WDC WD10EADS	1000204886016	409	0	0.0	NaN	NaN	NaN	0.0	0.0
4	2014-03-01	S1F0CSW2	ST3000DM001	3000592982016	229	0	0.0	0.0	7.301556e+10	0.0	NaN	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...
14734298	2023-03-31	ZCH06VE2	ST12000NM0007	12000138625024	0	0	0.0	1.0	0.000000e+00	NaN	NaN	0.0
14734299	2023-03-31	X8L0A01BF97G	TOSHIBA MG07ACA14TA	14000519643136	0	0	0.0	NaN	NaN	NaN	0.0	0.0
14734300	2023-03-31	9JG4657T	WDC WUH721414ALE6L4	14000519643136	0	0	0.0	NaN	NaN	NaN	0.0	0.0
14734301	2023-03-31	6090A00RFVKG	TOSHIBA MG08ACA16TA	16000900661248	0	0	0.0	NaN	NaN	NaN	0.0	0.0
14734302	2023-03-31	51R0A2Q8FVGG	TOSHIBA MG08ACA16TE	16000900661248	0	0	0.0	NaN	NaN	NaN	0.0	0.0

	date	serial_number	model	capacity_bytes	days_to_failure	failure	smart_5_raw	smart_187_raw	smart_188_raw	smart_189_raw	smart_196_raw	smart_197_raw	prefailure
0	2014-03-01	MJ1311YNG36USA	Hitachi HDS5C3030ALA630	3000592982016	991	0	67.0	NaN	NaN	NaN	101.0	0.0	0.0
1	2014-03-01	MJ1311YNG733NA	Hitachi HDS5C3030ALA630	3000592982016	840	0	0.0	NaN	NaN	NaN	0.0	0.0	0.0
2	2014-03-01	W3009AX6	ST4000DM000	4000787030016	54	0	0.0	0.0	0.000000e+00	1.0	NaN	8.0	0.0
3	2014-03-01	WD-WCAV5M690585	WDC WD10EADS	1000204886016	409	0	0.0	NaN	NaN	NaN	0.0	0.0	0.0
4	2014-03-01	S1F0CSW2	ST3000DM001	3000592982016	229	0	0.0	0.0	7.301556e+10	0.0	NaN	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...
14734123	2023-03-30	ZCH06VE2	ST12000NM0007	12000138625024	1	0	0.0	1.0	0.000000e+00	NaN	NaN	0.0	1.0
14734124	2023-03-30	X8L0A01BF97G	TOSHIBA MG07ACA14TA	14000519643136	1	0	0.0	NaN	NaN	NaN	0.0	0.0	1.0
14734126	2023-03-30	9JG4657T	WDC WUH721414ALE6L4	14000519643136	1	0	0.0	NaN	NaN	NaN	0.0	0.0	1.0
14734128	2023-03-30	6090A00RFVKG	TOSHIBA MG08ACA16TA	16000900661248	1	0	0.0	NaN	NaN	NaN	0.0	0.0	1.0
14734129	2023-03-30	51R0A2Q8FVGG	TOSHIBA MG08ACA16TE	16000900661248	1	0	0.0	NaN	NaN	NaN	0.0	0.0	1.0

	smart_5_raw	smart_187_raw	smart_188_raw	smart_189_raw	smart_196_raw	smart_197_raw	prefailure
0	67.000	0.000	0.000	0.000	101.000	0.000	0.000
1	0.000	0.000	0.000	0.000	0.000	0.000	0.000
2	0.000	0.000	0.000	1.000	0.000	8.000	0.000
3	0.000	0.000	0.000	0.000	0.000	0.000	0.000
4	0.000	0.000	73015558161.000	0.000	0.000	0.000	0.000
...	...	...	...	...	...	...	...
14734123	0.000	1.000	0.000	0.000	0.000	0.000	1.000
14734124	0.000	0.000	0.000	0.000	0.000	0.000	1.000
14734126	0.000	0.000	0.000	0.000	0.000	0.000	1.000
14734128	0.000	0.000	0.000	0.000	0.000	0.000	1.000
14734129	0.000	0.000	0.000	0.000	0.000	0.000	1.000

	count	mean	std	max
smart_5_raw	14716816.000	167.991	2089.559	65528.000
smart_187_raw	14716816.000	5.031	285.034	65535.000
smart_188_raw	14716816.000	1743647741.141	80449956585.986	10196408011086.000
smart_189_raw	14716816.000	5.959	523.304	65535.000
smart_196_raw	14716816.000	3.622	70.742	9031.000
smart_197_raw	14716816.000	11.524	946.471	462016.000
prefailure	14716816.000	0.016	0.126	1.000

	smart_5_raw	smart_187_raw	smart_188_raw	smart_189_raw	smart_196_raw	smart_197_raw
14100879	0.000	0.000	0.000	0.000	0.000	0.000
4695531	0.000	0.000	0.000	0.000	0.000	0.000
7070152	0.000	0.000	0.000	0.000	0.000	0.000
3122247	0.000	0.000	0.000	0.000	0.000	0.000
9590837	0.000	0.000	0.000	0.000	0.000	0.000
...	...	...	...	...	...	...
7147799	0.000	0.000	0.000	0.000	0.000	0.000
109774	0.000	0.000	0.000	1.000	0.000	0.000
14733017	0.000	0.000	0.000	0.000	0.000	0.000
1621313	0.000	0.000	0.000	0.000	0.000	0.000
10544676	8.000	23.000	0.000	0.000	0.000	8.000

	smart_5_raw	smart_187_raw	smart_188_raw	smart_189_raw	smart_196_raw	smart_197_raw
9917009	0.000	0.000	0.000	0.000	0.000	0.000
9177127	0.000	0.000	0.000	0.000	0.000	0.000
13290672	0.000	0.000	0.000	18.000	0.000	0.000
4478932	0.000	0.000	0.000	0.000	0.000	0.000
7245520	0.000	0.000	0.000	0.000	0.000	0.000
...	...	...	...	...	...	...
5602563	0.000	0.000	0.000	0.000	0.000	0.000
7134641	16.000	0.000	0.000	0.000	0.000	0.000
333634	0.000	0.000	0.000	2.000	0.000	0.000
946427	26352.000	0.000	77310590994.000	1.000	0.000	0.000
6659736	0.000	0.000	0.000	0.000	0.000	0.000