# -*- coding: utf-8 -*-
"""
Created on Sun Feb 28 10:04:26 2016
PCA source code
@author: liudiwei
"""
import datetime
import json
import re
import sys
import traceback

import jenkspy
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
import requests
import xlrd
from scipy.stats import f
from scipy.stats import norm
from scipy.stats.distributions import chi2

import config
from smote import smote

# Matches a lone "=" that is not already part of "==", ">=", "<=" or "!=".
_SINGLE_EQUALS = re.compile(r"(?<![<>=!])=(?!=)")


def get_history_value(points, time, interval):
    """Fetch sampled history values for several points and time windows.

    Args:
        points: Comma-separated item names.
        time: Semicolon-separated windows, each written as "start,end".
        interval: Sampling period forwarded to the service unchanged.

    Returns:
        A numpy array with one column per point; the rows are the samples of
        every window concatenated in the order the windows were given.
    """
    url = f"http://{config._EXA_IP}:9000/exawebapi/exatime/GetSamplingValueArrayFloat"
    headers = {"Content-Type": "application/json;charset=utf-8"}
    point_array = points.split(",")
    time_span = time.split(";")
    value_array = []
    for item in point_array:
        value_group = []
        for time_piece in time_span:
            bounds = time_piece.split(",")
            para = {"ItemName": item, "StartingTime": bounds[0], "TerminalTime": bounds[1],
                    "SamplingPeriod": interval}
            response = requests.get(url, headers=headers, params=para)
            # The service wraps the JSON array in quotes; strip them before parsing.
            content = response.text.replace('"[', '[').replace(']"', ']')
            # Each returned row is indexed as row[1] to obtain the sampled value.
            value_group.extend(row[1] for row in json.loads(content))
        value_array.append(value_group)
    return np.transpose(np.array(value_array))


def _resolve_component_count(k, explained_ratio):
    """Turn ``k`` into a number of principal components.

    A float strictly between 0 and 1 is read as a cumulative explained-variance
    target: the result is the smallest number of leading components whose
    cumulative ratio exceeds it (all components if none does). Any other value
    is returned unchanged.
    """
    if isinstance(k, float) and 0 < k < 1:
        cumulative = 0.0
        for count, ratio in enumerate(explained_ratio, start=1):
            cumulative += ratio
            if cumulative > k:
                return count
        return len(explained_ratio)
    return k


def pca(XMat, k):
    """Train a PCA monitoring model and compute its control limits.

    Args:
        XMat: Two-dimensional sample matrix; rows are samples, columns are
            features.
        k: Number of leading eigenvectors to keep. A float strictly between
            0 and 1 is interpreted as the cumulative explained-variance
            fraction to reach and is converted to a component count.

    Returns:
        A dict of JSON-serialisable training statistics, thresholds (Q/SPE,
        T2 and combined, at 95% and 99%), the covariance matrix, eigenvalues
        and eigenvectors; or ``None`` (after printing a message) when ``k``
        exceeds the number of features.

    Note:
        A constant column has zero standard deviation, so standardisation
        divides by zero and produces NaN; avoid such columns in training data.
    """
    m, n = np.shape(XMat)
    average = np.mean(XMat, axis=0)
    std = np.std(XMat, axis=0)
    avgs = np.tile(average, (m, 1))
    stds = np.tile(std, (m, 1))
    data_adjust = np.divide(XMat - avgs, stds)  # z-score standardisation
    covX = np.cov(data_adjust.T)
    featValue, featVec = np.linalg.eig(covX)
    featValue = np.real(featValue)
    featVec = np.real(featVec)

    # Sort eigenpairs by eigenvalue, largest first.
    index = np.argsort(-featValue)
    featValue = featValue[index]
    featVec = featVec[:, index]
    featValue_sum = np.divide(featValue, np.sum(featValue))

    k = _resolve_component_count(k, featValue_sum)
    if k > n:
        print("k must lower than feature number")
        return None

    # Eigenvectors are columns; np.matrix is kept deliberately because the
    # shapes of the bias statistics below depend on matrix semantics.
    selectVec = np.matrix(featVec[:, :k])
    finalData = np.dot(data_adjust, selectVec).dot(selectVec.T)
    reconData = np.add(np.multiply(finalData, stds), avgs)  # reconstruction

    Train_X_min = np.min(XMat, axis=0)
    Train_X_max = np.max(XMat, axis=0)
    Train_X_mean = np.mean(XMat, axis=0)
    Train_X_std = np.std(XMat, axis=0)
    Train_X_bais = XMat - reconData  # reconstruction residual
    Train_X_bais_max = np.max(np.abs(Train_X_bais), axis=0)
    Train_X_bais_min = np.min(np.abs(Train_X_bais), axis=0)
    Train_X_bais_mean = np.mean(np.abs(Train_X_bais), axis=0)

    bais_std = np.std(Train_X_bais, axis=0)
    Train_X_bais_std_upperB95 = np.array(np.abs(1.96 * bais_std + Train_X_bais_mean))[0]
    Train_X_bais_std_upperB99 = np.array(np.abs(2.58 * bais_std + Train_X_bais_mean))[0]
    Train_X_bais_std_lowerB95 = np.array(np.abs(1.96 * bais_std - Train_X_bais_mean))[0]
    Train_X_bais_std_lowerB99 = np.array(np.abs(2.58 * bais_std - Train_X_bais_mean))[0]
    # Per-feature limit: the larger of the two band magnitudes.
    QCUL_95_line = np.array([max(upper, lower) for upper, lower in
                             zip(Train_X_bais_std_upperB95, Train_X_bais_std_lowerB95)])
    QCUL_99_line = np.array([max(upper, lower) for upper, lower in
                             zip(Train_X_bais_std_upperB99, Train_X_bais_std_lowerB99)])

    # ---- Q (SPE) control limits from the discarded eigenvalues ----
    residual_values = featValue[k:]
    theta1 = np.sum(residual_values)
    theta2 = np.sum(np.power(residual_values, 2))
    theta3 = np.sum(np.power(residual_values, 3))
    h0 = 1 - 2 * theta1 * theta3 / (3 * np.power(theta2, 2))
    ca_95 = norm.ppf(0.95, loc=0, scale=1)
    QCUL_95 = theta1 * np.power(
        h0 * ca_95 * np.sqrt(2 * theta2) / theta1 + 1 + theta2 * h0 * (h0 - 1) / np.power(theta1, 2),
        1 / h0)
    ca_99 = norm.ppf(0.99, loc=0, scale=1)
    QCUL_99 = theta1 * np.power(
        (h0 * ca_99 * np.sqrt(2 * theta2) / theta1 + 1 + theta2 * h0 * (h0 - 1) / np.power(theta1, 2)),
        1 / h0)

    # ---- T2 control limits ----
    f_95 = f.ppf(0.95, k, m - k)
    T2CUL_95 = k * (m - 1) * (m + 1) * f_95 / (m * (m - k))
    T2CUL_95_line = np.sqrt(T2CUL_95) * Train_X_std / np.sqrt(m)
    f_99 = f.ppf(0.99, k, m - k)
    T2CUL_99 = k * (m - 1) * (m + 1) * f_99 / (m * (m - k))
    T2CUL_99_line = np.sqrt(T2CUL_99) * Train_X_std / np.sqrt(m)

    # ---- Combined index limits (chi-square approximation) ----
    gfi_95 = (k / pow(T2CUL_95, 2) + theta2 / pow(QCUL_95, 2)) / (k / T2CUL_95 + theta1 / QCUL_95)
    hfi_95 = pow((k / T2CUL_95 + theta1 / QCUL_95), 2) / (k / pow(T2CUL_95, 2) + theta2 / pow(QCUL_95, 2))
    Kesi_95 = gfi_95 * chi2.ppf(0.95, hfi_95)
    Kesi_95_line = np.sqrt(Kesi_95) * Train_X_std / np.sqrt(m)
    gfi_99 = (k / pow(T2CUL_99, 2) + theta2 / pow(QCUL_99, 2)) / (k / T2CUL_99 + theta1 / QCUL_99)
    hfi_99 = pow((k / T2CUL_99 + theta1 / QCUL_99), 2) / (k / pow(T2CUL_99, 2) + theta2 / pow(QCUL_99, 2))
    Kesi_99 = gfi_99 * chi2.ppf(0.99, hfi_99)
    Kesi_99_line = np.sqrt(Kesi_99) * Train_X_std / np.sqrt(m)

    return {
        'Train_X_min': np.around(Train_X_min, decimals=3).tolist(),
        'Train_X_max': np.around(Train_X_max, decimals=3).tolist(),
        'Train_X_std': np.around(Train_X_std, decimals=3).tolist(),
        'Train_X_mean': np.around(Train_X_mean, decimals=3).tolist(),
        'Train_X_bais_max': np.around(Train_X_bais_max, decimals=3).tolist(),
        'Train_X_bais_min': np.around(Train_X_bais_min, decimals=3).tolist(),
        'Train_X_bais_mean': np.around(Train_X_bais_mean, decimals=3).tolist(),
        'QCUL_95': QCUL_95.tolist(),
        'QCUL_99': QCUL_99.tolist(),
        'QCUL_95_line': np.around(QCUL_95_line, decimals=3).tolist(),
        'QCUL_99_line': np.around(QCUL_99_line, decimals=3).tolist(),
        'T2CUL_95': np.around(T2CUL_95, decimals=3).tolist(),
        'T2CUL_99': np.around(T2CUL_99, decimals=3).tolist(),
        'T2CUL_95_line': np.around(T2CUL_95_line, decimals=3).tolist(),
        'T2CUL_99_line': np.around(T2CUL_99_line, decimals=3).tolist(),
        'Kesi_95': np.around(Kesi_95, decimals=3).tolist(),
        'Kesi_99': np.around(Kesi_99, decimals=3).tolist(),
        'Kesi_95_line': np.around(Kesi_95_line, decimals=3).tolist(),
        'Kesi_99_line': np.around(Kesi_99_line, decimals=3).tolist(),
        'COV': np.around(covX, decimals=3).tolist(),
        'K': k,
        "featValue": np.around(featValue, decimals=3).tolist(),
        "featVec": np.around(featVec, decimals=3).tolist(),
        "selectVec": np.around(selectVec, decimals=3).tolist(),
    }


def main(info):
    """Train a PCA model on raw history data described by ``info``.

    Args:
        info: Dict with "Train_Data" (keys "points", "time", "interval") and
            "Hyper_para" (key "percent", forwarded to ``pca`` as ``k``).

    Returns:
        The model as a JSON string in which NaN tokens are replaced by -1 so
        that strict JSON parsers can read it.
    """
    Train_Data = info["Train_Data"]
    points = Train_Data["points"]
    time = Train_Data["time"]
    interval = Train_Data["interval"]
    percent = info["Hyper_para"]["percent"]
    XMat = get_history_value(points, time, interval)
    result = pca(XMat, percent)
    # pca returns a dict, so serialise before the textual NaN substitution.
    return json.dumps(result).replace("NaN", "-1")


def _is_plain_decimal(text):
    """Return True for an optionally negative integer or decimal literal."""
    unsigned = text[1:] if text.startswith("-") else text
    whole, dot, fraction = unsigned.partition(".")
    if not whole.isdigit():
        return False
    return not dot or fraction.isdigit()


def isnumber(limits):
    """Return True when every string in ``limits`` is a plain number.

    Accepts integers and decimals with an optional leading minus sign
    (for example "5", "-3", "0.4"); an empty sequence yields True.
    """
    return all(_is_plain_decimal(item) for item in limits)


def clearmain(info):
    """Fetch cleaned training data, optionally oversample it, and train PCA.

    Args:
        info: Dict with "Train_Data" (keys "time", "points", "interval",
            "dead", "limit", "uplow"), "Hyper_para" (key "percent"),
            "conditon" (extra constraint expression) and, optionally,
            "smote" together with "smote_config".

    Returns:
        A JSON string. On success it is the PCA model extended with
        "BeforeCleanSamNum", "AfterCleanSamNum" and "CleanOrNot": true.
        On any failure the traceback is printed and the string encodes
        ``[{"CleanOrNot": false}]``.
    """
    try:
        Train_Data = info["Train_Data"]
        # Turn a lone "=" into "==" while leaving ">=", "<=", "==" and "!=" intact.
        condition = _SINGLE_EQUALS.sub("==", info["conditon"])
        times = Train_Data["time"].split(';')
        points = Train_Data["points"].split(',')
        interval = Train_Data["interval"]
        DCount = {10000: 60, 100000: 6, 300000: 5}.get(interval, 4)
        dead = Train_Data["dead"].split(',')
        limit = Train_Data["limit"].split(',')
        uplower = Train_Data["uplow"].split(';')
        percent = info["Hyper_para"]["percent"]

        ItemsInfo = []
        limit_clauses = []
        for i, point in enumerate(points):
            ItemsInfo.append({
                "ItemName": point,
                # "1" means the point takes part in dead-zone cleaning.
                "ClearDeadZone": "true" if dead[i] == "1" else "false",
            })
            if limit[i] == "1":  # the point takes part in range cleaning
                limits = uplower[i].split(',')
                if isnumber(limits):
                    limit_clauses.append(
                        "[" + point + "]>" + limits[0] + " and " + "[" + point + "]<" + limits[1] + " and ")
        if limit_clauses:
            # Trim the trailing "and " left by the last clause.
            Constraint = "".join(limit_clauses)[:-4]
        else:
            Constraint = "1==1"  # no range cleaning requested
        Constraint += " and (" + condition + ")"
        Constraint = Constraint.replace("\n", " ")

        SamplingTimePeriods = []
        for window in times:
            timess = window.split(',')
            SamplingTimePeriods.append({"StartingTime": timess[0], "TerminalTime": timess[1]})

        url = f"http://{config._CLEAN_IP}/exawebapi/exatime/GetCleaningData?ItemsInfo=%s&SamplingTimePeriods=%s&Constraint=%s&SamplingPeriod=%s&DCount=%d" % (
            ItemsInfo, SamplingTimePeriods, Constraint, interval, DCount)
        response = requests.get(url)
        content = json.loads(response.text)
        data = np.array(list(content["ClearData"])).T

        if info.get("smote", False):
            smote_index = [points.index(item["smote_point"]) for item in info["smote_config"]]
            smote_num = [item["break_point_num"] for item in info["smote_config"]]
            data = smote(data, smote_index, smote_num)

        # pca returns a dict; go through JSON text so NaN can be replaced by -1.
        result = json.loads(json.dumps(pca(data, percent)).replace("NaN", "-1"))
        result["BeforeCleanSamNum"] = content["BeforeCleanSamNum"]
        result["AfterCleanSamNum"] = content["AfterCleanSamNum"]
        result["CleanOrNot"] = True
        return json.dumps(result)
    except Exception:
        # Best-effort boundary: report failure to the caller but keep the cause visible.
        traceback.print_exc()
        result = [{"CleanOrNot": False}]
        return json.dumps(result)


if __name__ == "__main__":
    worksheet = pd.read_excel("train_data_1.xlsx", sheet_name="Sheet1")
    data = [item for _, item in worksheet.iterrows()]
    pca(data, 3)
    # Example payload for clearmain:
    # info_str = '{"Train_Data":{"time":"2020-01-12 19:35:34,2020-01-13 04:29:48;2020-01-13 05:46:07,2020-01-13 13:18:00","points":"JL_D1_10DAS05B:MAD10CY102A,JL_D1_10DAS05B:MAD10CY103A,JL_D1_10DAS05B:MAD20CY102A,JL_D1_10DAS05B:MAD20CY103A,JL_D1_10DAS05B:MAD30CY102A,JL_D1_10DAS05B:MAD30CY103A,JL_D1_10DAS05B:MAD40CY102A,JL_D1_10DAS05B:MAD40CY103A,JL_D1_10DAS05B:MAD50CY102A,JL_D1_10DAS05B:MAD50CY103A,JL_D1_10DAS05B:MAD60CY102A,JL_D1_10DAS05B:MAD60CY103A,JL_D1_10DAS05B:MAD70CY102A,JL_D1_10DAS05B:MAD70CY103A,JL_D1_10DAS05B:MAD80CY102A,JL_D1_10DAS05B:MAD80CY103A,JL_D1_10DAS05B:MAD90CY102A,JL_D1_10DAS05B:MAD90CY103A,JL_D1_10MCS07A:MAV15CT301,JL_D1_10DAS09A:MAV15CP101.PNT,JL_D1_10DAS25B:10MKA01CE001.PNT","interval":300000,"dead":"1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1","limit":"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0","uplow":"0,500;0,500;0,500;0,500;0,500;0,500;0,500;0,500;0,500;0,500;0,500;0,500;0,500;0,500;0,500;0,500;0,500;0,500;0,100;0,0.4;0,866"},"Hyper_para":{"percent":0.85},"type":"PCA","conditon":"[JL_D1_10DAS25B:10MKA01CE001.PNT]>0 and [JL_D1_1OA_IO:GV13PZCHAR]<5 and [JL_D1_1OA_IO:GV24PZCHAR]<5"}'
    # info = json.loads(info_str)
    # res = json.loads(clearmain(info))
    # print("aaa")