#import the relevant libraries
import os
import pymysql
import pandas as pd
#establish the connection to the mysql database
host = "192.168.88.187"
port = "3306"
user = "backblaze"
password = "Testing.2023"
database = "backblaze_ml"
conn = pymysql.connect(
host=host,
port=int(3306),
user=user,
passwd=password,
db=database,
charset='utf8mb4')
#take a look at the data
sqldf = pd.read_sql_query("select * from drive_stats limit 10", conn)
sqldf
sqldf.shape
sqldf.columns
import datetime
sqldf.date = pd.to_datetime(sqldf.date, format='%Y-%m-%d').dt.date
from IPython.display import display
pd.options.display.max_columns = None
pd.options.display.max_rows = None
df = pd.read_sql_query("describe drive_stats", conn)
df
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
#take a bigger look at the data -> this time selecting only the raw tables and excluding the ones not relevant for correlation
#df = pd.read_sql_query("select date, serial_number, model, smart_1_raw,smart_2_raw,smart_3_raw,smart_4_raw,smart_5_raw,smart_7_raw,smart_8_raw,smart_9_raw,smart_10_raw,smart_11_raw,smart_12_raw,smart_13_raw,smart_15_raw,smart_16_raw,smart_17_raw,smart_18_raw,smart_22_raw,smart_23_raw,smart_24_raw,smart_160_raw,smart_161_raw,smart_163_raw,smart_164_raw,smart_165_raw,smart_166_raw,smart_167_raw,smart_168_raw,smart_169_raw,smart_170_raw,smart_171_raw,smart_172_raw,smart_173_raw,smart_174_raw,smart_175_raw,smart_176_raw,smart_177_raw,smart_178_raw,smart_179_raw,smart_180_raw,smart_181_raw,smart_182_raw,smart_183_raw,smart_184_raw,smart_187_raw,smart_188_raw,smart_189_raw,smart_190_raw,smart_191_raw,smart_192_raw,smart_193_raw,smart_194_raw,smart_195_raw,smart_196_raw,smart_197_raw,smart_198_raw,smart_199_raw,smart_200_raw,smart_201_raw,smart_202_raw,smart_206_raw,smart_210_raw,smart_218_raw,smart_220_raw,smart_222_raw,smart_223_raw,smart_224_raw,smart_225_raw,smart_226_raw,smart_230_raw,smart_231_raw,smart_232_raw,smart_233_raw,smart_234_raw,smart_235_raw,smart_240_raw,smart_241_raw,smart_242_raw,smart_244_raw,smart_245_raw,smart_246_raw,smart_247_raw,smart_248_raw,smart_250_raw,smart_251_raw,smart_252_raw,smart_254_raw,smart_255_raw, failure from drive_stats limit 1000", conn)
df = pd.read_sql_query("select * from drive_stats", conn)
#df.tail(10)
df
df.fillna(0, inplace = True)
df.tail(10)
df.columns
import numpy as np
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (-1,1))
df[['smart_5_raw',
'smart_187_raw', 'smart_188_raw', 'smart_189_raw', 'smart_196_raw',
'smart_197_raw', 'smart_198_raw']] = scaler.fit_transform(df[['smart_5_raw',
'smart_187_raw', 'smart_188_raw', 'smart_189_raw', 'smart_196_raw',
'smart_197_raw', 'smart_198_raw']])
#df[['smart_1_raw','smart_2_raw','smart_3_raw','smart_4_raw','smart_5_raw','smart_7_raw','smart_8_raw','smart_9_raw','smart_10_raw','smart_11_raw','smart_12_raw','smart_13_raw','smart_15_raw','smart_16_raw','smart_17_raw','smart_18_raw','smart_22_raw','smart_23_raw','smart_24_raw','smart_160_raw','smart_161_raw','smart_163_raw','smart_164_raw','smart_165_raw','smart_166_raw','smart_167_raw','smart_168_raw','smart_169_raw','smart_170_raw','smart_171_raw','smart_172_raw','smart_173_raw','smart_174_raw','smart_175_raw','smart_176_raw','smart_177_raw','smart_178_raw','smart_179_raw','smart_180_raw','smart_181_raw','smart_182_raw','smart_183_raw','smart_184_raw','smart_187_raw','smart_188_raw','smart_189_raw','smart_190_raw','smart_191_raw','smart_192_raw','smart_193_raw','smart_194_raw','smart_195_raw','smart_196_raw','smart_197_raw','smart_198_raw','smart_199_raw','smart_200_raw','smart_201_raw','smart_202_raw','smart_206_raw','smart_210_raw','smart_218_raw','smart_220_raw','smart_222_raw','smart_223_raw','smart_224_raw','smart_225_raw','smart_226_raw','smart_230_raw','smart_231_raw','smart_232_raw','smart_233_raw','smart_234_raw','smart_235_raw','smart_240_raw','smart_241_raw','smart_242_raw','smart_244_raw','smart_245_raw','smart_246_raw','smart_247_raw','smart_248_raw','smart_250_raw','smart_251_raw','smart_252_raw','smart_254_raw','smart_255_raw']] = scaler.fit_transform(df[['smart_1_raw','smart_2_raw','smart_3_raw','smart_4_raw','smart_5_raw','smart_7_raw','smart_8_raw','smart_9_raw','smart_10_raw','smart_11_raw','smart_12_raw','smart_13_raw','smart_15_raw','smart_16_raw','smart_17_raw','smart_18_raw','smart_22_raw','smart_23_raw','smart_24_raw','smart_160_raw','smart_161_raw','smart_163_raw','smart_164_raw','smart_165_raw','smart_166_raw','smart_167_raw','smart_168_raw','smart_169_raw','smart_170_raw','smart_171_raw','smart_172_raw','smart_173_raw','smart_174_raw','smart_175_raw','smart_176_raw','smart_177_raw','smart_178_raw','smart_179_raw','smart_180_raw','smart_181_raw','smart_182_raw','smart_183_raw','smart_184_raw','smart_187_raw','smart_188_raw','smart_189_raw','smart_190_raw','smart_191_raw','smart_192_raw','smart_193_raw','smart_194_raw','smart_195_raw','smart_196_raw','smart_197_raw','smart_198_raw','smart_199_raw','smart_200_raw','smart_201_raw','smart_202_raw','smart_206_raw','smart_210_raw','smart_218_raw','smart_220_raw','smart_222_raw','smart_223_raw','smart_224_raw','smart_225_raw','smart_226_raw','smart_230_raw','smart_231_raw','smart_232_raw','smart_233_raw','smart_234_raw','smart_235_raw','smart_240_raw','smart_241_raw','smart_242_raw','smart_244_raw','smart_245_raw','smart_246_raw','smart_247_raw','smart_248_raw','smart_250_raw','smart_251_raw','smart_252_raw','smart_254_raw','smart_255_raw']])
#df
df
df.columns
#heatmap
import seaborn as sns
import matplotlib.pyplot as plt
cr=df[['smart_5_raw',
'smart_187_raw', 'smart_188_raw', 'smart_189_raw', 'smart_196_raw',
'smart_197_raw', 'smart_198_raw','failure']].corr()
#colormap = plt.cm.RdBu
plt.figure(figsize=(15,15))
plt.title('Correlation matrix', y=1.05, size=16)
mask = np.zeros_like(cr)
mask[np.triu_indices_from(mask)] = True
#heatm = sns.heatmap(cr, mask = mask, linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)
heatm = sns.heatmap(cr, mask = mask, annot=True, cmap="Blues", fmt='.1g')
plt.show()
As we can see from the heatmap above there is strong correlation between smart_198_raw and smart_197_raw Backblaze have chosen both for their prediction model as not all disk manufacturers report both however for my case - the general model (for HDDs in the BB dataset) - I will select only one because all selected disk models report both of them and I'll chose the one with most entries in the database -> smart_197_raw (197 has 15602408 reported entries / 198 has 14792555 reported entries)
Final cleanup stage
mysql> select count(smart_197_raw) from drive_stats ; +----------------------+ | count(smart_197_raw) | +----------------------+ | 15602408 | +----------------------+ 1 row in set (43.70 sec) mysql> select count(smart_198_raw) from drive_stats ; +----------------------+ | count(smart_198_raw) | +----------------------+ | 14792555 | +----------------------+ 1 row in set (41.92 sec) mysql> alter table drive_stats drop column smart_198_raw; Query OK, 15618027 rows affected (11 min 54.73 sec) Records: 15618027 Duplicates: 0 Warnings: 0 mysql> describe drive_stats; +-----------------+-------------+------+-----+---------+-------+ | Field | Type | Null | Key | Default | Extra | +-----------------+-------------+------+-----+---------+-------+ | date | date | YES | MUL | NULL | | | serial_number | varchar(64) | YES | MUL | NULL | | | model | varchar(64) | YES | MUL | NULL | | | capacity_bytes | bigint | YES | MUL | NULL | | | days_to_failure | int | YES | | NULL | | | failure | int | YES | MUL | NULL | | | smart_5_raw | bigint | YES | | NULL | | | smart_187_raw | bigint | YES | | NULL | | | smart_188_raw | bigint | YES | | NULL | | | smart_189_raw | bigint | YES | | NULL | | | smart_196_raw | bigint | YES | | NULL | | | smart_197_raw | bigint | YES | | NULL | | +-----------------+-------------+------+-----+---------+-------+ 12 rows in set (0.02 sec)