validate thresholds whenever SMART data is recieved.
This commit is contained in:
@@ -2,6 +2,7 @@ package database
|
||||
|
||||
import (
|
||||
"context"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg/models"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg/models/collector"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg/models/measurements"
|
||||
@@ -16,6 +17,7 @@ type DeviceRepo interface {
|
||||
RegisterDevice(ctx context.Context, dev models.Device) error
|
||||
GetDevices(ctx context.Context) ([]models.Device, error)
|
||||
UpdateDevice(ctx context.Context, wwn string, collectorSmartData collector.SmartInfo) (models.Device, error)
|
||||
UpdateDeviceStatus(ctx context.Context, wwn string, status pkg.DeviceStatus) (models.Device, error)
|
||||
GetDeviceDetails(ctx context.Context, wwn string) (models.Device, error)
|
||||
|
||||
SaveSmartAttributes(ctx context.Context, wwn string, collectorSmartData collector.SmartInfo) (measurements.Smart, error)
|
||||
|
||||
@@ -3,6 +3,7 @@ package database
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg/config"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg/models"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg/models/collector"
|
||||
@@ -163,6 +164,17 @@ func (sr *scrutinyRepository) UpdateDevice(ctx context.Context, wwn string, coll
|
||||
return device, sr.gormClient.Model(&device).Updates(device).Error
|
||||
}
|
||||
|
||||
//Update Device Status
|
||||
func (sr *scrutinyRepository) UpdateDeviceStatus(ctx context.Context, wwn string, status pkg.DeviceStatus) (models.Device, error) {
|
||||
var device models.Device
|
||||
if err := sr.gormClient.WithContext(ctx).Where("wwn = ?", wwn).First(&device).Error; err != nil {
|
||||
return device, fmt.Errorf("Could not get device from DB", err)
|
||||
}
|
||||
|
||||
device.DeviceStatus = pkg.Set(device.DeviceStatus, status)
|
||||
return device, sr.gormClient.Model(&device).Updates(device).Error
|
||||
}
|
||||
|
||||
func (sr *scrutinyRepository) GetDeviceDetails(ctx context.Context, wwn string) (models.Device, error) {
|
||||
var device models.Device
|
||||
|
||||
@@ -434,3 +446,11 @@ func (sr *scrutinyRepository) GetSummary(ctx context.Context) (map[string]*model
|
||||
|
||||
return summaries, nil
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Process Thresholds
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
func (sr *scrutinyRepository) ProcessSmartAttributeThresholds() {
|
||||
|
||||
}
|
||||
|
||||
@@ -3,8 +3,8 @@ package measurements
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg/metadata"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg/models/collector"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg/thresholds"
|
||||
"log"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -22,6 +22,9 @@ type Smart struct {
|
||||
|
||||
//Attributes (fields)
|
||||
Attributes map[string]SmartAttribute `json:"attrs"`
|
||||
|
||||
//status
|
||||
Status pkg.DeviceStatus
|
||||
}
|
||||
|
||||
func (sm *Smart) Flatten() (tags map[string]string, fields map[string]interface{}) {
|
||||
@@ -133,6 +136,7 @@ func (sm *Smart) FromCollectorSmartInfo(wwn string, info collector.SmartInfo) er
|
||||
|
||||
//generate SmartAtaAttribute entries from Scrutiny Collector Smart data.
|
||||
func (sm *Smart) ProcessAtaSmartInfo(info collector.SmartInfo) {
|
||||
sm.Status = pkg.DeviceStatusPassed
|
||||
for _, collectorAttr := range info.AtaSmartAttributes.Table {
|
||||
attrModel := SmartAtaAttribute{
|
||||
AttributeId: collectorAttr.ID,
|
||||
@@ -146,53 +150,72 @@ func (sm *Smart) ProcessAtaSmartInfo(info collector.SmartInfo) {
|
||||
}
|
||||
|
||||
//now that we've parsed the data from the smartctl response, lets match it against our metadata rules and add additional Scrutiny specific data.
|
||||
if smartMetadata, ok := metadata.AtaMetadata[collectorAttr.ID]; ok {
|
||||
if smartMetadata, ok := thresholds.AtaMetadata[collectorAttr.ID]; ok {
|
||||
attrModel.Name = smartMetadata.DisplayName
|
||||
if smartMetadata.Transform != nil {
|
||||
attrModel.TransformedValue = smartMetadata.Transform(attrModel.Value, attrModel.RawValue, attrModel.RawString)
|
||||
}
|
||||
}
|
||||
attrModel.PopulateAttributeStatus()
|
||||
sm.Attributes[string(collectorAttr.ID)] = &attrModel
|
||||
if attrModel.Status == pkg.SmartAttributeStatusFailed {
|
||||
sm.Status = pkg.DeviceStatusFailedScrutiny
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//generate SmartNvmeAttribute entries from Scrutiny Collector Smart data.
|
||||
func (sm *Smart) ProcessNvmeSmartInfo(info collector.SmartInfo) {
|
||||
|
||||
sm.Attributes = map[string]SmartAttribute{
|
||||
"critical_warning": &SmartNvmeAttribute{AttributeId: "critical_warning", Name: "Critical Warning", Value: info.NvmeSmartHealthInformationLog.CriticalWarning, Threshold: 0},
|
||||
"temperature": &SmartNvmeAttribute{AttributeId: "temperature", Name: "Temperature", Value: info.NvmeSmartHealthInformationLog.Temperature, Threshold: -1},
|
||||
"available_spare": &SmartNvmeAttribute{AttributeId: "available_spare", Name: "Available Spare", Value: info.NvmeSmartHealthInformationLog.AvailableSpare, Threshold: info.NvmeSmartHealthInformationLog.AvailableSpareThreshold},
|
||||
"percentage_used": &SmartNvmeAttribute{AttributeId: "percentage_used", Name: "Percentage Used", Value: info.NvmeSmartHealthInformationLog.PercentageUsed, Threshold: 100},
|
||||
"data_units_read": &SmartNvmeAttribute{AttributeId: "data_units_read", Name: "Data Units Read", Value: info.NvmeSmartHealthInformationLog.DataUnitsRead, Threshold: -1},
|
||||
"data_units_written": &SmartNvmeAttribute{AttributeId: "data_units_written", Name: "Data Units Written", Value: info.NvmeSmartHealthInformationLog.DataUnitsWritten, Threshold: -1},
|
||||
"host_reads": &SmartNvmeAttribute{AttributeId: "host_reads", Name: "Host Reads", Value: info.NvmeSmartHealthInformationLog.HostReads, Threshold: -1},
|
||||
"host_writes": &SmartNvmeAttribute{AttributeId: "host_writes", Name: "Host Writes", Value: info.NvmeSmartHealthInformationLog.HostWrites, Threshold: -1},
|
||||
"controller_busy_time": &SmartNvmeAttribute{AttributeId: "controller_busy_time", Name: "Controller Busy Time", Value: info.NvmeSmartHealthInformationLog.ControllerBusyTime, Threshold: -1},
|
||||
"power_cycles": &SmartNvmeAttribute{AttributeId: "power_cycles", Name: "Power Cycles", Value: info.NvmeSmartHealthInformationLog.PowerCycles, Threshold: -1},
|
||||
"power_on_hours": &SmartNvmeAttribute{AttributeId: "power_on_hours", Name: "Power on Hours", Value: info.NvmeSmartHealthInformationLog.PowerOnHours, Threshold: -1},
|
||||
"unsafe_shutdowns": &SmartNvmeAttribute{AttributeId: "unsafe_shutdowns", Name: "Unsafe Shutdowns", Value: info.NvmeSmartHealthInformationLog.UnsafeShutdowns, Threshold: -1},
|
||||
"media_errors": &SmartNvmeAttribute{AttributeId: "media_errors", Name: "Media Errors", Value: info.NvmeSmartHealthInformationLog.MediaErrors, Threshold: 0},
|
||||
"num_err_log_entries": &SmartNvmeAttribute{AttributeId: "num_err_log_entries", Name: "Numb Err Log Entries", Value: info.NvmeSmartHealthInformationLog.NumErrLogEntries, Threshold: 0},
|
||||
"warning_temp_time": &SmartNvmeAttribute{AttributeId: "warning_temp_time", Name: "Warning Temp Time", Value: info.NvmeSmartHealthInformationLog.WarningTempTime, Threshold: -1},
|
||||
"critical_comp_time": &SmartNvmeAttribute{AttributeId: "critical_comp_time", Name: "Critical CompTime", Value: info.NvmeSmartHealthInformationLog.CriticalCompTime, Threshold: -1},
|
||||
"critical_warning": (&SmartNvmeAttribute{AttributeId: "critical_warning", Name: "Critical Warning", Value: info.NvmeSmartHealthInformationLog.CriticalWarning, Threshold: 0}).PopulateAttributeStatus(),
|
||||
"temperature": (&SmartNvmeAttribute{AttributeId: "temperature", Name: "Temperature", Value: info.NvmeSmartHealthInformationLog.Temperature, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"available_spare": (&SmartNvmeAttribute{AttributeId: "available_spare", Name: "Available Spare", Value: info.NvmeSmartHealthInformationLog.AvailableSpare, Threshold: info.NvmeSmartHealthInformationLog.AvailableSpareThreshold}).PopulateAttributeStatus(),
|
||||
"percentage_used": (&SmartNvmeAttribute{AttributeId: "percentage_used", Name: "Percentage Used", Value: info.NvmeSmartHealthInformationLog.PercentageUsed, Threshold: 100}).PopulateAttributeStatus(),
|
||||
"data_units_read": (&SmartNvmeAttribute{AttributeId: "data_units_read", Name: "Data Units Read", Value: info.NvmeSmartHealthInformationLog.DataUnitsRead, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"data_units_written": (&SmartNvmeAttribute{AttributeId: "data_units_written", Name: "Data Units Written", Value: info.NvmeSmartHealthInformationLog.DataUnitsWritten, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"host_reads": (&SmartNvmeAttribute{AttributeId: "host_reads", Name: "Host Reads", Value: info.NvmeSmartHealthInformationLog.HostReads, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"host_writes": (&SmartNvmeAttribute{AttributeId: "host_writes", Name: "Host Writes", Value: info.NvmeSmartHealthInformationLog.HostWrites, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"controller_busy_time": (&SmartNvmeAttribute{AttributeId: "controller_busy_time", Name: "Controller Busy Time", Value: info.NvmeSmartHealthInformationLog.ControllerBusyTime, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"power_cycles": (&SmartNvmeAttribute{AttributeId: "power_cycles", Name: "Power Cycles", Value: info.NvmeSmartHealthInformationLog.PowerCycles, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"power_on_hours": (&SmartNvmeAttribute{AttributeId: "power_on_hours", Name: "Power on Hours", Value: info.NvmeSmartHealthInformationLog.PowerOnHours, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"unsafe_shutdowns": (&SmartNvmeAttribute{AttributeId: "unsafe_shutdowns", Name: "Unsafe Shutdowns", Value: info.NvmeSmartHealthInformationLog.UnsafeShutdowns, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"media_errors": (&SmartNvmeAttribute{AttributeId: "media_errors", Name: "Media Errors", Value: info.NvmeSmartHealthInformationLog.MediaErrors, Threshold: 0}).PopulateAttributeStatus(),
|
||||
"num_err_log_entries": (&SmartNvmeAttribute{AttributeId: "num_err_log_entries", Name: "Numb Err Log Entries", Value: info.NvmeSmartHealthInformationLog.NumErrLogEntries, Threshold: 0}).PopulateAttributeStatus(),
|
||||
"warning_temp_time": (&SmartNvmeAttribute{AttributeId: "warning_temp_time", Name: "Warning Temp Time", Value: info.NvmeSmartHealthInformationLog.WarningTempTime, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"critical_comp_time": (&SmartNvmeAttribute{AttributeId: "critical_comp_time", Name: "Critical CompTime", Value: info.NvmeSmartHealthInformationLog.CriticalCompTime, Threshold: -1}).PopulateAttributeStatus(),
|
||||
}
|
||||
|
||||
//find analyzed attribute status
|
||||
for _, val := range sm.Attributes {
|
||||
if val.GetStatus() == pkg.SmartAttributeStatusFailed {
|
||||
sm.Status = pkg.DeviceStatusFailedScrutiny
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//generate SmartScsiAttribute entries from Scrutiny Collector Smart data.
|
||||
func (sm *Smart) ProcessScsiSmartInfo(info collector.SmartInfo) {
|
||||
sm.Attributes = map[string]SmartAttribute{
|
||||
"scsi_grown_defect_list": &SmartScsiAttribute{AttributeId: "scsi_grown_defect_list", Name: "Grown Defect List", Value: info.ScsiGrownDefectList, Threshold: 0},
|
||||
"read_errors_corrected_by_eccfast": &SmartScsiAttribute{AttributeId: "read_errors_corrected_by_eccfast", Name: "Read Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccfast, Threshold: -1},
|
||||
"read_errors_corrected_by_eccdelayed": &SmartScsiAttribute{AttributeId: "read_errors_corrected_by_eccdelayed", Name: "Read Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccdelayed, Threshold: -1},
|
||||
"read_errors_corrected_by_rereads_rewrites": &SmartScsiAttribute{AttributeId: "read_errors_corrected_by_rereads_rewrites", Name: "Read Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByRereadsRewrites, Threshold: 0},
|
||||
"read_total_errors_corrected": &SmartScsiAttribute{AttributeId: "read_total_errors_corrected", Name: "Read Total Errors Corrected", Value: info.ScsiErrorCounterLog.Read.TotalErrorsCorrected, Threshold: -1},
|
||||
"read_correction_algorithm_invocations": &SmartScsiAttribute{AttributeId: "read_correction_algorithm_invocations", Name: "Read Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Read.CorrectionAlgorithmInvocations, Threshold: -1},
|
||||
"read_total_uncorrected_errors": &SmartScsiAttribute{AttributeId: "read_total_uncorrected_errors", Name: "Read Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Read.TotalUncorrectedErrors, Threshold: 0},
|
||||
"write_errors_corrected_by_eccfast": &SmartScsiAttribute{AttributeId: "write_errors_corrected_by_eccfast", Name: "Write Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccfast, Threshold: -1},
|
||||
"write_errors_corrected_by_eccdelayed": &SmartScsiAttribute{AttributeId: "write_errors_corrected_by_eccdelayed", Name: "Write Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccdelayed, Threshold: -1},
|
||||
"write_errors_corrected_by_rereads_rewrites": &SmartScsiAttribute{AttributeId: "write_errors_corrected_by_rereads_rewrites", Name: "Write Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByRereadsRewrites, Threshold: 0},
|
||||
"write_total_errors_corrected": &SmartScsiAttribute{AttributeId: "write_total_errors_corrected", Name: "Write Total Errors Corrected", Value: info.ScsiErrorCounterLog.Write.TotalErrorsCorrected, Threshold: -1},
|
||||
"write_correction_algorithm_invocations": &SmartScsiAttribute{AttributeId: "write_correction_algorithm_invocations", Name: "Write Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Write.CorrectionAlgorithmInvocations, Threshold: -1},
|
||||
"write_total_uncorrected_errors": &SmartScsiAttribute{AttributeId: "write_total_uncorrected_errors", Name: "Write Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Write.TotalUncorrectedErrors, Threshold: 0},
|
||||
"scsi_grown_defect_list": (&SmartScsiAttribute{AttributeId: "scsi_grown_defect_list", Name: "Grown Defect List", Value: info.ScsiGrownDefectList, Threshold: 0}).PopulateAttributeStatus(),
|
||||
"read_errors_corrected_by_eccfast": (&SmartScsiAttribute{AttributeId: "read_errors_corrected_by_eccfast", Name: "Read Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccfast, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"read_errors_corrected_by_eccdelayed": (&SmartScsiAttribute{AttributeId: "read_errors_corrected_by_eccdelayed", Name: "Read Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByEccdelayed, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"read_errors_corrected_by_rereads_rewrites": (&SmartScsiAttribute{AttributeId: "read_errors_corrected_by_rereads_rewrites", Name: "Read Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Read.ErrorsCorrectedByRereadsRewrites, Threshold: 0}).PopulateAttributeStatus(),
|
||||
"read_total_errors_corrected": (&SmartScsiAttribute{AttributeId: "read_total_errors_corrected", Name: "Read Total Errors Corrected", Value: info.ScsiErrorCounterLog.Read.TotalErrorsCorrected, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"read_correction_algorithm_invocations": (&SmartScsiAttribute{AttributeId: "read_correction_algorithm_invocations", Name: "Read Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Read.CorrectionAlgorithmInvocations, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"read_total_uncorrected_errors": (&SmartScsiAttribute{AttributeId: "read_total_uncorrected_errors", Name: "Read Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Read.TotalUncorrectedErrors, Threshold: 0}).PopulateAttributeStatus(),
|
||||
"write_errors_corrected_by_eccfast": (&SmartScsiAttribute{AttributeId: "write_errors_corrected_by_eccfast", Name: "Write Errors Corrected by ECC Fast", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccfast, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"write_errors_corrected_by_eccdelayed": (&SmartScsiAttribute{AttributeId: "write_errors_corrected_by_eccdelayed", Name: "Write Errors Corrected by ECC Delayed", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByEccdelayed, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"write_errors_corrected_by_rereads_rewrites": (&SmartScsiAttribute{AttributeId: "write_errors_corrected_by_rereads_rewrites", Name: "Write Errors Corrected by ReReads/ReWrites", Value: info.ScsiErrorCounterLog.Write.ErrorsCorrectedByRereadsRewrites, Threshold: 0}).PopulateAttributeStatus(),
|
||||
"write_total_errors_corrected": (&SmartScsiAttribute{AttributeId: "write_total_errors_corrected", Name: "Write Total Errors Corrected", Value: info.ScsiErrorCounterLog.Write.TotalErrorsCorrected, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"write_correction_algorithm_invocations": (&SmartScsiAttribute{AttributeId: "write_correction_algorithm_invocations", Name: "Write Correction Algorithm Invocations", Value: info.ScsiErrorCounterLog.Write.CorrectionAlgorithmInvocations, Threshold: -1}).PopulateAttributeStatus(),
|
||||
"write_total_uncorrected_errors": (&SmartScsiAttribute{AttributeId: "write_total_uncorrected_errors", Name: "Write Total Uncorrected Errors", Value: info.ScsiErrorCounterLog.Write.TotalUncorrectedErrors, Threshold: 0}).PopulateAttributeStatus(),
|
||||
}
|
||||
|
||||
//find analyzed attribute status
|
||||
for _, val := range sm.Attributes {
|
||||
if val.GetStatus() == pkg.SmartAttributeStatusFailed {
|
||||
sm.Status = pkg.DeviceStatusFailedScrutiny
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,14 +2,12 @@ package measurements
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg/thresholds"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const SmartAttributeStatusPassed = "passed"
|
||||
const SmartAttributeStatusFailed = "failed"
|
||||
const SmartAttributeStatusWarning = "warn"
|
||||
|
||||
type SmartAtaAttribute struct {
|
||||
AttributeId int `json:"attribute_id"`
|
||||
Name string `json:"name"`
|
||||
@@ -27,6 +25,10 @@ type SmartAtaAttribute struct {
|
||||
FailureRate float64 `json:"failure_rate,omitempty"`
|
||||
}
|
||||
|
||||
func (sa *SmartAtaAttribute) GetStatus() string {
|
||||
return sa.Status
|
||||
}
|
||||
|
||||
func (sa *SmartAtaAttribute) Flatten() map[string]interface{} {
|
||||
|
||||
idString := strconv.Itoa(sa.AttributeId)
|
||||
@@ -71,81 +73,82 @@ func (sa *SmartAtaAttribute) Inflate(key string, val interface{}) {
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
////populate attribute status, using SMART Thresholds & Observed Metadata
|
||||
//func (sa *SmartAtaAttribute) PopulateAttributeStatus() {
|
||||
// if strings.ToUpper(sa.WhenFailed) == SmartWhenFailedFailingNow {
|
||||
// //this attribute has previously failed
|
||||
// sa.Status = SmartAttributeStatusFailed
|
||||
// sa.StatusReason = "Attribute is failing manufacturer SMART threshold"
|
||||
//
|
||||
// } else if strings.ToUpper(sa.WhenFailed) == SmartWhenFailedInThePast {
|
||||
// sa.Status = SmartAttributeStatusWarning
|
||||
// sa.StatusReason = "Attribute has previously failed manufacturer SMART threshold"
|
||||
// }
|
||||
//
|
||||
// if smartMetadata, ok := metadata.AtaMetadata[sa.AttributeId]; ok {
|
||||
// sa.MetadataObservedThresholdStatus(smartMetadata)
|
||||
// }
|
||||
//
|
||||
// //check if status is blank, set to "passed"
|
||||
// if len(sa.Status) == 0 {
|
||||
// sa.Status = SmartAttributeStatusPassed
|
||||
// }
|
||||
//}
|
||||
//
|
||||
//// compare the attribute (raw, normalized, transformed) value to observed thresholds, and update status if necessary
|
||||
//func (sa *SmartAtaAttribute) MetadataObservedThresholdStatus(smartMetadata metadata.AtaAttributeMetadata) {
|
||||
// //TODO: multiple rules
|
||||
// // try to predict the failure rates for observed thresholds that have 0 failure rate and error bars.
|
||||
// // - if the attribute is critical
|
||||
// // - the failure rate is over 10 - set to failed
|
||||
// // - the attribute does not match any threshold, set to warn
|
||||
// // - if the attribute is not critical
|
||||
// // - if failure rate is above 20 - set to failed
|
||||
// // - if failure rate is above 10 but below 20 - set to warn
|
||||
//
|
||||
// //update the smart attribute status based on Observed thresholds.
|
||||
// var value int64
|
||||
// if smartMetadata.DisplayType == metadata.AtaSmartAttributeDisplayTypeNormalized {
|
||||
// value = int64(sa.Value)
|
||||
// } else if smartMetadata.DisplayType == metadata.AtaSmartAttributeDisplayTypeTransformed {
|
||||
// value = sa.TransformedValue
|
||||
// } else {
|
||||
// value = sa.RawValue
|
||||
// }
|
||||
//
|
||||
// for _, obsThresh := range smartMetadata.ObservedThresholds {
|
||||
//
|
||||
// //check if "value" is in this bucket
|
||||
// if ((obsThresh.Low == obsThresh.High) && value == obsThresh.Low) ||
|
||||
// (obsThresh.Low < value && value <= obsThresh.High) {
|
||||
// sa.FailureRate = obsThresh.AnnualFailureRate
|
||||
//
|
||||
// if smartMetadata.Critical {
|
||||
// if obsThresh.AnnualFailureRate >= 0.10 {
|
||||
// sa.Status = SmartAttributeStatusFailed
|
||||
// sa.StatusReason = "Observed Failure Rate for Critical Attribute is greater than 10%"
|
||||
// }
|
||||
// } else {
|
||||
// if obsThresh.AnnualFailureRate >= 0.20 {
|
||||
// sa.Status = SmartAttributeStatusFailed
|
||||
// sa.StatusReason = "Observed Failure Rate for Attribute is greater than 20%"
|
||||
// } else if obsThresh.AnnualFailureRate >= 0.10 {
|
||||
// sa.Status = SmartAttributeStatusWarning
|
||||
// sa.StatusReason = "Observed Failure Rate for Attribute is greater than 10%"
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// //we've found the correct bucket, we can drop out of this loop
|
||||
// return
|
||||
// }
|
||||
// }
|
||||
// // no bucket found
|
||||
// if smartMetadata.Critical {
|
||||
// sa.Status = SmartAttributeStatusWarning
|
||||
// sa.StatusReason = "Could not determine Observed Failure Rate for Critical Attribute"
|
||||
// }
|
||||
//
|
||||
// return
|
||||
//}
|
||||
//populate attribute status, using SMART Thresholds & Observed Metadata
|
||||
// Chainable
|
||||
func (sa *SmartAtaAttribute) PopulateAttributeStatus() *SmartAtaAttribute {
|
||||
if strings.ToUpper(sa.WhenFailed) == pkg.SmartWhenFailedFailingNow {
|
||||
//this attribute has previously failed
|
||||
sa.Status = pkg.SmartAttributeStatusFailed
|
||||
sa.StatusReason = "Attribute is failing manufacturer SMART threshold"
|
||||
|
||||
} else if strings.ToUpper(sa.WhenFailed) == pkg.SmartWhenFailedInThePast {
|
||||
sa.Status = pkg.SmartAttributeStatusWarning
|
||||
sa.StatusReason = "Attribute has previously failed manufacturer SMART threshold"
|
||||
}
|
||||
|
||||
if smartMetadata, ok := thresholds.AtaMetadata[sa.AttributeId]; ok {
|
||||
sa.ValidateThreshold(smartMetadata)
|
||||
}
|
||||
|
||||
//check if status is blank, set to "passed"
|
||||
if len(sa.Status) == 0 {
|
||||
sa.Status = pkg.SmartAttributeStatusPassed
|
||||
}
|
||||
return sa
|
||||
}
|
||||
|
||||
// compare the attribute (raw, normalized, transformed) value to observed thresholds, and update status if necessary
|
||||
func (sa *SmartAtaAttribute) ValidateThreshold(smartMetadata thresholds.AtaAttributeMetadata) {
|
||||
//TODO: multiple rules
|
||||
// try to predict the failure rates for observed thresholds that have 0 failure rate and error bars.
|
||||
// - if the attribute is critical
|
||||
// - the failure rate is over 10 - set to failed
|
||||
// - the attribute does not match any threshold, set to warn
|
||||
// - if the attribute is not critical
|
||||
// - if failure rate is above 20 - set to failed
|
||||
// - if failure rate is above 10 but below 20 - set to warn
|
||||
|
||||
//update the smart attribute status based on Observed thresholds.
|
||||
var value int64
|
||||
if smartMetadata.DisplayType == thresholds.AtaSmartAttributeDisplayTypeNormalized {
|
||||
value = int64(sa.Value)
|
||||
} else if smartMetadata.DisplayType == thresholds.AtaSmartAttributeDisplayTypeTransformed {
|
||||
value = sa.TransformedValue
|
||||
} else {
|
||||
value = sa.RawValue
|
||||
}
|
||||
|
||||
for _, obsThresh := range smartMetadata.ObservedThresholds {
|
||||
|
||||
//check if "value" is in this bucket
|
||||
if ((obsThresh.Low == obsThresh.High) && value == obsThresh.Low) ||
|
||||
(obsThresh.Low < value && value <= obsThresh.High) {
|
||||
sa.FailureRate = obsThresh.AnnualFailureRate
|
||||
|
||||
if smartMetadata.Critical {
|
||||
if obsThresh.AnnualFailureRate >= 0.10 {
|
||||
sa.Status = pkg.SmartAttributeStatusFailed
|
||||
sa.StatusReason = "Observed Failure Rate for Critical Attribute is greater than 10%"
|
||||
}
|
||||
} else {
|
||||
if obsThresh.AnnualFailureRate >= 0.20 {
|
||||
sa.Status = pkg.SmartAttributeStatusFailed
|
||||
sa.StatusReason = "Observed Failure Rate for Attribute is greater than 20%"
|
||||
} else if obsThresh.AnnualFailureRate >= 0.10 {
|
||||
sa.Status = pkg.SmartAttributeStatusWarning
|
||||
sa.StatusReason = "Observed Failure Rate for Attribute is greater than 10%"
|
||||
}
|
||||
}
|
||||
|
||||
//we've found the correct bucket, we can drop out of this loop
|
||||
return
|
||||
}
|
||||
}
|
||||
// no bucket found
|
||||
if smartMetadata.Critical {
|
||||
sa.Status = pkg.SmartAttributeStatusWarning
|
||||
sa.StatusReason = "Could not determine Observed Failure Rate for Critical Attribute"
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
@@ -3,4 +3,5 @@ package measurements
|
||||
type SmartAttribute interface {
|
||||
Flatten() (fields map[string]interface{})
|
||||
Inflate(key string, val interface{})
|
||||
GetStatus() string
|
||||
}
|
||||
|
||||
@@ -2,6 +2,8 @@ package measurements
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg/thresholds"
|
||||
"strings"
|
||||
)
|
||||
|
||||
@@ -17,6 +19,10 @@ type SmartNvmeAttribute struct {
|
||||
FailureRate float64 `json:"failure_rate,omitempty"`
|
||||
}
|
||||
|
||||
func (sa *SmartNvmeAttribute) GetStatus() string {
|
||||
return sa.Status
|
||||
}
|
||||
|
||||
func (sa *SmartNvmeAttribute) Flatten() map[string]interface{} {
|
||||
return map[string]interface{}{
|
||||
fmt.Sprintf("attr.%s.attribute_id", sa.AttributeId): sa.AttributeId,
|
||||
@@ -44,25 +50,26 @@ func (sa *SmartNvmeAttribute) Inflate(key string, val interface{}) {
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
////populate attribute status, using SMART Thresholds & Observed Metadata
|
||||
//func (sa *SmartNvmeAttribute) PopulateAttributeStatus() {
|
||||
//
|
||||
// //-1 is a special number meaning no threshold.
|
||||
// if sa.Threshold != -1 {
|
||||
// if smartMetadata, ok := metadata.NmveMetadata[sa.AttributeId]; ok {
|
||||
// //check what the ideal is. Ideal tells us if we our recorded value needs to be above, or below the threshold
|
||||
// if (smartMetadata.Ideal == "low" && sa.Value > sa.Threshold) ||
|
||||
// (smartMetadata.Ideal == "high" && sa.Value < sa.Threshold) {
|
||||
// sa.Status = SmartAttributeStatusFailed
|
||||
// sa.StatusReason = "Attribute is failing recommended SMART threshold"
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// //TODO: eventually figure out the critical_warning bits and determine correct error messages here.
|
||||
//
|
||||
// //check if status is blank, set to "passed"
|
||||
// if len(sa.Status) == 0 {
|
||||
// sa.Status = SmartAttributeStatusPassed
|
||||
// }
|
||||
//}
|
||||
//populate attribute status, using SMART Thresholds & Observed Metadata
|
||||
// Chainable
|
||||
func (sa *SmartNvmeAttribute) PopulateAttributeStatus() *SmartNvmeAttribute {
|
||||
|
||||
//-1 is a special number meaning no threshold.
|
||||
if sa.Threshold != -1 {
|
||||
if smartMetadata, ok := thresholds.NmveMetadata[sa.AttributeId]; ok {
|
||||
//check what the ideal is. Ideal tells us if we our recorded value needs to be above, or below the threshold
|
||||
if (smartMetadata.Ideal == "low" && sa.Value > sa.Threshold) ||
|
||||
(smartMetadata.Ideal == "high" && sa.Value < sa.Threshold) {
|
||||
sa.Status = pkg.SmartAttributeStatusFailed
|
||||
sa.StatusReason = "Attribute is failing recommended SMART threshold"
|
||||
}
|
||||
}
|
||||
}
|
||||
//TODO: eventually figure out the critical_warning bits and determine correct error messages here.
|
||||
|
||||
//check if status is blank, set to "passed"
|
||||
if len(sa.Status) == 0 {
|
||||
sa.Status = pkg.SmartAttributeStatusPassed
|
||||
}
|
||||
return sa
|
||||
}
|
||||
|
||||
@@ -2,6 +2,8 @@ package measurements
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg/thresholds"
|
||||
"strings"
|
||||
)
|
||||
|
||||
@@ -17,6 +19,10 @@ type SmartScsiAttribute struct {
|
||||
FailureRate float64 `json:"failure_rate,omitempty"`
|
||||
}
|
||||
|
||||
func (sa *SmartScsiAttribute) GetStatus() string {
|
||||
return sa.Status
|
||||
}
|
||||
|
||||
func (sa *SmartScsiAttribute) Flatten() map[string]interface{} {
|
||||
return map[string]interface{}{
|
||||
fmt.Sprintf("attr.%s.attribute_id", sa.AttributeId): sa.AttributeId,
|
||||
@@ -45,23 +51,25 @@ func (sa *SmartScsiAttribute) Inflate(key string, val interface{}) {
|
||||
}
|
||||
|
||||
//
|
||||
////populate attribute status, using SMART Thresholds & Observed Metadata
|
||||
//func (sa *SmartScsiAttribute) PopulateAttributeStatus() {
|
||||
//
|
||||
// //-1 is a special number meaning no threshold.
|
||||
// if sa.Threshold != -1 {
|
||||
// if smartMetadata, ok := metadata.NmveMetadata[sa.AttributeId]; ok {
|
||||
// //check what the ideal is. Ideal tells us if we our recorded value needs to be above, or below the threshold
|
||||
// if (smartMetadata.Ideal == "low" && sa.Value > sa.Threshold) ||
|
||||
// (smartMetadata.Ideal == "high" && sa.Value < sa.Threshold) {
|
||||
// sa.Status = SmartAttributeStatusFailed
|
||||
// sa.StatusReason = "Attribute is failing recommended SMART threshold"
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// //check if status is blank, set to "passed"
|
||||
// if len(sa.Status) == 0 {
|
||||
// sa.Status = SmartAttributeStatusPassed
|
||||
// }
|
||||
//}
|
||||
//populate attribute status, using SMART Thresholds & Observed Metadata
|
||||
//Chainable
|
||||
func (sa *SmartScsiAttribute) PopulateAttributeStatus() *SmartScsiAttribute {
|
||||
|
||||
//-1 is a special number meaning no threshold.
|
||||
if sa.Threshold != -1 {
|
||||
if smartMetadata, ok := thresholds.NmveMetadata[sa.AttributeId]; ok {
|
||||
//check what the ideal is. Ideal tells us if we our recorded value needs to be above, or below the threshold
|
||||
if (smartMetadata.Ideal == "low" && sa.Value > sa.Threshold) ||
|
||||
(smartMetadata.Ideal == "high" && sa.Value < sa.Threshold) {
|
||||
sa.Status = pkg.SmartAttributeStatusFailed
|
||||
sa.StatusReason = "Attribute is failing recommended SMART threshold"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//check if status is blank, set to "passed"
|
||||
if len(sa.Status) == 0 {
|
||||
sa.Status = pkg.SmartAttributeStatusPassed
|
||||
}
|
||||
return sa
|
||||
}
|
||||
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
package metadata
|
||||
package thresholds
|
||||
|
||||
const AtaSmartAttributeDisplayTypeRaw = "raw"
|
||||
const AtaSmartAttributeDisplayTypeNormalized = "normalized"
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
package metadata
|
||||
package thresholds
|
||||
|
||||
// https://media.kingston.com/support/downloads/MKP_521.6_SMART-DCP1000_attribute.pdf
|
||||
// https://www.percona.com/blog/2017/02/09/using-nvme-command-line-tools-to-check-nvme-flash-health/
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
package metadata
|
||||
package thresholds
|
||||
|
||||
type ScsiAttributeMetadata struct {
|
||||
ID string `json:"-"`
|
||||
@@ -2,7 +2,7 @@ package handler
|
||||
|
||||
import (
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg/database"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg/metadata"
|
||||
"github.com/analogj/scrutiny/webapp/backend/pkg/thresholds"
|
||||
"github.com/gin-gonic/gin"
|
||||
"github.com/sirupsen/logrus"
|
||||
"net/http"
|
||||
@@ -23,11 +23,11 @@ func GetDeviceDetails(c *gin.Context) {
|
||||
|
||||
var deviceMetadata interface{}
|
||||
if device.IsAta() {
|
||||
deviceMetadata = metadata.AtaMetadata
|
||||
deviceMetadata = thresholds.AtaMetadata
|
||||
} else if device.IsNvme() {
|
||||
deviceMetadata = metadata.NmveMetadata
|
||||
deviceMetadata = thresholds.NmveMetadata
|
||||
} else if device.IsScsi() {
|
||||
deviceMetadata = metadata.ScsiMetadata
|
||||
deviceMetadata = thresholds.ScsiMetadata
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"success": true, "data": map[string]interface{}{"device": device, "smart_results": smartResults}, "metadata": deviceMetadata})
|
||||
|
||||
@@ -37,13 +37,23 @@ func UploadDeviceMetrics(c *gin.Context) {
|
||||
}
|
||||
|
||||
// insert smart info
|
||||
_, err = deviceRepo.SaveSmartAttributes(c, c.Param("wwn"), collectorSmartData)
|
||||
smartData, err := deviceRepo.SaveSmartAttributes(c, c.Param("wwn"), collectorSmartData)
|
||||
if err != nil {
|
||||
logger.Errorln("An error occurred while saving smartctl metrics", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"success": false})
|
||||
return
|
||||
}
|
||||
|
||||
if smartData.Status != pkg.DeviceStatusPassed {
|
||||
//there is a failure detected by Scrutiny, update the device status on the homepage.
|
||||
updatedDevice, err = deviceRepo.UpdateDeviceStatus(c, c.Param("wwn"), smartData.Status)
|
||||
if err != nil {
|
||||
logger.Errorln("An error occurred while updating device status", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"success": false})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// save smart temperature data (ignore failures)
|
||||
err = deviceRepo.SaveSmartTemperature(c, c.Param("wwn"), updatedDevice.DeviceProtocol, collectorSmartData)
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user