You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

305 lines
15 KiB

# -*- coding: utf-8 -*-
"""
Created on Sun Feb 28 10:04:26 2016
PCA source code
@author: liudiwe
"""
import numpy as np
import traceback
import pandas as pd
from json import JSONDecodeError
#import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.stats import f
from scipy.stats.distributions import chi2
import json
import sys
import requests
import datetime
import jenkspy
import xlrd
from smote import smote
import config
"""
参数
- XMat传入的是一个numpy的矩阵格式行表示样本数列表示特征
- k表示取前k个特征值对应的特征向量
返回值
"""
def get_history_value(points, time, interval):
    """Fetch sampled history for several points and return it as a 2-D array.

    Args:
        points: Comma-separated point names.
        time: Time windows separated by ";"; each window is "start,end".
        interval: Sampling period, forwarded unchanged as ``SamplingPeriod``.

    Returns:
        ``np.ndarray`` with one column per point; the rows are the samples
        of every time window, concatenated in the order the windows appear.
    """
    endpoint = f"http://{config._EXA_IP}:9000/exawebapi/exatime/GetSamplingValueArrayFloat"
    request_headers = {"Content-Type": "application/json;charset=utf-8"}
    point_names = points.split(",")
    windows = time.split(";")

    per_point_series = []
    for point_name in point_names:
        series = []
        for window in windows:
            bounds = window.split(",")
            query = {
                "ItemName": point_name,
                "StartingTime": bounds[0],
                "TerminalTime": bounds[1],
                "SamplingPeriod": interval,
            }
            reply = requests.get(endpoint, headers=request_headers, params=query)
            # The service wraps the array in quotes; strip them so the
            # payload parses as a JSON array instead of a JSON string.
            unwrapped = reply.text.replace('"[', '[').replace(']"', ']')
            samples = json.loads(unwrapped)
            # Only the second element of each row is kept
            # (presumably rows are [timestamp, value] -- TODO confirm).
            series.extend(sample[1] for sample in samples)
        per_point_series.append(series)

    # One list per point -> transpose so that points become columns.
    return np.transpose(np.array(per_point_series))
def _rounded(value, decimals=3):
    """Round a scalar/array with numpy and convert it to plain Python types."""
    return np.around(value, decimals=decimals).tolist()


def _residual_band(bias_std, bias_mean, z_score):
    """Return the per-feature reconstruction-error limit for one z-score.

    ``bias_std`` and ``bias_mean`` are 1 x n ``np.matrix`` rows, which is why
    the results are converted with ``np.array(...)[0]`` to get flat vectors.
    """
    upper = np.array(np.abs(z_score * bias_std + bias_mean))[0]
    lower = np.array(np.abs(z_score * bias_std - bias_mean))[0]
    return np.array([max(hi, lo) for hi, lo in zip(upper, lower)])


def _spe_limit(theta1, theta2, h0, confidence):
    """Return the Q (squared prediction error) limit at ``confidence``.

    The expression has the form of the Jackson-Mudholkar approximation built
    from the sums of powers of the discarded eigenvalues.
    """
    c_alpha = norm.ppf(confidence, loc=0, scale=1)
    return theta1 * np.power(
        h0 * c_alpha * np.sqrt(2 * theta2) / theta1 + 1 + theta2 * h0 * (h0 - 1) / np.power(theta1, 2),
        1 / h0)


def _t2_limit(k, m, confidence):
    """Return the T2 limit for ``k`` components and ``m`` samples (F-distribution based)."""
    f_crit = f.ppf(confidence, k, m - k)
    return k * (m - 1) * (m + 1) * f_crit / (m * (m - k))


def _combined_limit(k, theta1, theta2, t2_limit, q_limit, confidence):
    """Return the combined (T2 + Q) index limit, a scaled chi-square quantile."""
    g = (k / pow(t2_limit, 2) + theta2 / pow(q_limit, 2)) / (k / t2_limit + theta1 / q_limit)
    h = pow((k / t2_limit + theta1 / q_limit), 2) / (k / pow(t2_limit, 2) + theta2 / pow(q_limit, 2))
    return g * chi2.ppf(confidence, h)


def pca(XMat, p):
    """Train a PCA monitoring model and return it serialized as JSON.

    The columns of ``XMat`` are standardized (zero mean, unit standard
    deviation), the covariance matrix of the standardized data is
    eigen-decomposed, and the leading components are kept until their
    cumulative share of the eigenvalue sum exceeds ``p``.

    Args:
        XMat: 2-D array, rows are samples and columns are features.
        p: Cumulative eigenvalue share that the kept components must exceed.
            If it is never exceeded, all components are kept.

    Returns:
        JSON string ``{"Model_info": {...}, "Model_type": "PCA"}``.
        ``Model_info`` holds training statistics, control limits at the
        95 % / 99 % levels, the covariance matrix, eigenvalues and
        eigenvectors. Degenerate inputs (for example when every component is
        kept, so no eigenvalue is discarded) yield NaN limits, which
        ``json.dumps`` emits as the non-standard token ``NaN``.

    Raises:
        ValueError: If more components than features were selected. The
            selection loop runs at most once per feature, so this is only a
            defensive guard.
    """
    col_mean = np.mean(XMat, axis=0)
    col_std = np.std(XMat, axis=0)
    m, n = np.shape(XMat)  # m samples, n features
    avgs = np.tile(col_mean, (m, 1))
    stds = np.tile(col_std, (m, 1))
    # A constant column has zero standard deviation and makes this division
    # produce NaN/inf; training data must not contain such columns.
    data_adjust = np.divide(XMat - avgs, stds)

    covX = np.cov(data_adjust.T)
    featValue, featVec = np.linalg.eig(covX)
    # eig may return complex values with negligible imaginary parts.
    featValue = np.real(featValue)
    featVec = np.real(featVec)
    order = np.argsort(-featValue)  # descending eigenvalues
    featValue = featValue[order]
    featVec = featVec[:, order]
    featValue_sum = np.divide(featValue, np.sum(featValue))

    per = 0  # cumulative eigenvalue share of the kept components
    k = 0  # number of kept components
    for share in featValue_sum:
        per += share
        k += 1
        if per > p:
            break
    if k > n:
        raise ValueError("k must be lower than or equal to the feature number")

    # np.matrix is kept on purpose: every value derived from selectVec stays
    # 2-D, which the [0] flattening in _residual_band and the nested-list
    # shape of the Train_X_bais_* outputs depend on.
    selectVec = np.matrix(featVec[:, :k])
    finalData = np.dot(data_adjust, selectVec).dot(selectVec.T)
    reconData = np.add(np.multiply(finalData, stds), avgs)  # back to original units

    Train_X_min = np.min(XMat, axis=0)
    Train_X_max = np.max(XMat, axis=0)
    Train_X_mean = col_mean
    Train_X_std = col_std
    Train_X_bais = XMat - reconData  # reconstruction residual
    abs_bias = np.abs(Train_X_bais)
    Train_X_bais_max = np.max(abs_bias, axis=0)
    Train_X_bais_min = np.min(abs_bias, axis=0)
    Train_X_bais_mean = np.mean(abs_bias, axis=0)
    bias_std = np.std(Train_X_bais, axis=0)
    QCUL_95_line = _residual_band(bias_std, Train_X_bais_mean, 1.96)
    QCUL_99_line = _residual_band(bias_std, Train_X_bais_mean, 2.58)

    # Q limits, built from the eigenvalues that were NOT kept.
    discarded = featValue[k:]
    theta1 = np.sum(discarded)
    theta2 = np.sum(np.power(discarded, 2))
    theta3 = np.sum(np.power(discarded, 3))
    h0 = 1 - 2 * theta1 * theta3 / (3 * np.power(theta2, 2))
    QCUL_95 = _spe_limit(theta1, theta2, h0, 0.95)
    QCUL_99 = _spe_limit(theta1, theta2, h0, 0.99)

    # T2 limits and their per-feature versions in original units.
    T2CUL_95 = _t2_limit(k, m, 0.95)
    T2CUL_95_line = np.sqrt(T2CUL_95) * Train_X_std / np.sqrt(m)
    T2CUL_99 = _t2_limit(k, m, 0.99)
    T2CUL_99_line = np.sqrt(T2CUL_99) * Train_X_std / np.sqrt(m)

    # Combined index limits.
    Kesi_95 = _combined_limit(k, theta1, theta2, T2CUL_95, QCUL_95, 0.95)
    Kesi_95_line = np.sqrt(Kesi_95) * Train_X_std / np.sqrt(m)
    Kesi_99 = _combined_limit(k, theta1, theta2, T2CUL_99, QCUL_99, 0.99)
    Kesi_99_line = np.sqrt(Kesi_99) * Train_X_std / np.sqrt(m)

    R = per  # reported as the cumulative share actually reached

    model_info = {
        'Train_X_min': _rounded(Train_X_min),
        'Train_X_max': _rounded(Train_X_max),
        'Train_X_std': _rounded(Train_X_std),
        'Train_X_mean': _rounded(Train_X_mean),
        'Train_X_bais_max': _rounded(Train_X_bais_max),
        'Train_X_bais_min': _rounded(Train_X_bais_min),
        'Train_X_bais_mean': _rounded(Train_X_bais_mean),
        'QCUL_95': _rounded(QCUL_95, decimals=10),
        'QCUL_99': _rounded(QCUL_99, decimals=10),
        'QCUL_95_line': _rounded(QCUL_95_line),
        'QCUL_99_line': _rounded(QCUL_99_line),
        'T2CUL_95': _rounded(T2CUL_95),
        'T2CUL_99': _rounded(T2CUL_99),
        'T2CUL_95_line': _rounded(T2CUL_95_line),
        'T2CUL_99_line': _rounded(T2CUL_99_line),
        'Kesi_95': _rounded(Kesi_95),
        'Kesi_99': _rounded(Kesi_99),
        'Kesi_95_line': _rounded(Kesi_95_line),
        'Kesi_99_line': _rounded(Kesi_99_line),
        'COV': _rounded(covX),
        'K': k,
        'R': _rounded(R),
        "featValue": _rounded(featValue),
        "featVec": _rounded(featVec),
        "selectVec": _rounded(selectVec),
    }
    return json.dumps({'Model_info': model_info, 'Model_type': 'PCA'})
def main(info):
    """Train a PCA model on raw (uncleaned) history data.

    Args:
        info: Request dict. ``info["Train_Data"]`` supplies ``points``,
            ``time`` and ``interval``; ``info["Hyper_para"]["percent"]`` is
            the cumulative eigenvalue share passed to ``pca``.

    Returns:
        The JSON string produced by ``pca`` with every ``NaN`` token replaced
        by ``-1`` so that strict JSON parsers can read it.
    """
    train_cfg = info["Train_Data"]
    point_names, periods, step = train_cfg["points"], train_cfg["time"], train_cfg["interval"]
    variance_share = info["Hyper_para"]["percent"]
    samples = get_history_value(point_names, periods, step)
    model_json = pca(samples, variance_share)
    return model_json.replace("NaN", "-1")
def isnumber(limits):
    """Return True when every entry of ``limits`` is a plain decimal number.

    Accepted form: an optional leading "-", one or more ASCII digits and an
    optional fractional part ("." followed by one or more digits), e.g.
    "12", "-3" or "0.75".

    The earlier implementation removed every "-" and then used
    ``str.isdigit``. That rejected decimal limits such as "0.5" and accepted
    malformed text such as "1-2" or "--5".

    Args:
        limits: Iterable of strings (for example ``["0.5", "100"]``).

    Returns:
        bool: True if all entries are numbers; also True for an empty
        iterable, as before.
    """
    ascii_digits = set("0123456789")

    def is_plain_number(text):
        unsigned = text[1:] if text.startswith("-") else text
        whole, dot, fraction = unsigned.partition(".")
        if not whole or not set(whole) <= ascii_digits:
            return False
        if dot:
            # A dot must be followed by at least one digit ("5." is rejected).
            return bool(fraction) and set(fraction) <= ascii_digits
        return True

    return all(is_plain_number(item) for item in limits)
# DCount sent to the cleaning service for known sampling intervals;
# every other interval uses _DEFAULT_DCOUNT.
_DCOUNT_BY_INTERVAL = {10000: 60, 100000: 6, 300000: 5}
_DEFAULT_DCOUNT = 4


def _normalize_condition(condition):
    """Rewrite a bare ``=`` as ``==`` and leave ``==``, ``!=``, ``>=``, ``<=`` intact.

    The earlier chained ``str.replace`` calls turned ``==`` into ``====`` and
    ``!=`` into ``!==``.
    """
    import re

    return re.sub(r"(?<![<>!=])=(?!=)", "==", condition)


def clearmain(info):
    """Train a PCA model on data cleaned by the remote cleaning service.

    Args:
        info: Request dict. Keys read here:
            ``Train_Data`` -- ``time`` ("start,end" windows joined by ";"),
            ``points`` (comma-separated), ``interval``, ``dead`` and ``limit``
            (comma-separated "1"/other flags, one per point) and ``uplow``
            ("lower,upper" pairs joined by ";", one per point);
            ``conditon`` -- extra filter expression (the key is spelled this
            way in the request format);
            ``Hyper_para.percent`` -- cumulative eigenvalue share for ``pca``;
            ``smote`` / ``smote_config`` -- optional oversampling settings.

    Returns:
        JSON string. On success it is the ``pca`` result plus
        ``BeforeCleanSamNum``, ``AfterCleanSamNum`` and ``CleanOrNot: true``.
        On any failure it is a one-element list
        ``[{"CleanOrNot": false, "msg": <traceback>}]``.
    """
    try:
        Train_Data = info["Train_Data"]
        condition = _normalize_condition(info["conditon"])
        times = Train_Data["time"].split(';')
        points = Train_Data["points"].split(',')
        interval = Train_Data["interval"]
        d_count = _DCOUNT_BY_INTERVAL.get(interval, _DEFAULT_DCOUNT)
        dead = Train_Data["dead"].split(',')
        limit = Train_Data["limit"].split(',')
        uplower = Train_Data["uplow"].split(';')
        percent = info["Hyper_para"]["percent"]

        items_info = []
        range_clauses = []
        for i, point in enumerate(points):
            items_info.append({
                "ItemName": point,
                # "1" opts the point into dead-zone cleaning.
                "ClearDeadZone": "true" if dead[i] == "1" else "false",
            })
            if limit[i] == "1":  # the point takes part in range filtering
                bounds = uplower[i].split(',')
                if isnumber(bounds):  # non-numeric bounds are ignored
                    range_clauses.append("[" + point + "]>" + bounds[0])
                    range_clauses.append("[" + point + "]<" + bounds[1])

        # "1==1" is the neutral filter when no range clause applies.
        constraint = " and ".join(range_clauses) if range_clauses else "1==1"
        constraint += " and (" + condition + ")"
        constraint = constraint.replace("\n", " ")

        sampling_periods = []
        for window in times:
            bounds = window.split(',')
            sampling_periods.append({"StartingTime": bounds[0], "TerminalTime": bounds[1]})

        url = f"http://{config._CLEAN_IP}/exawebapi/exatime/GetCleaningData"
        # Values are passed through ``params`` so that characters such as
        # "&", "#", "+" or "%" in a constraint or point name are
        # percent-encoded instead of corrupting the query string. The list
        # values keep the str() form the service has been receiving.
        query = {
            "ItemsInfo": str(items_info),
            "SamplingTimePeriods": str(sampling_periods),
            "Constraint": constraint,
            "SamplingPeriod": str(interval),
            "DCount": d_count,
        }
        response = requests.get(url, params=query)
        content = json.loads(response.text)
        data = np.array(list(content["ClearData"])).T

        if info.get("smote", False):
            try:
                smote_cfg = info["smote_config"]
                smote_index = [points.index(item["pointId"]) for item in smote_cfg if item["LAY_CHECKED"]]
                smote_num = [int(item["number"]) for item in smote_cfg if item["LAY_CHECKED"]]
                max_value = [float(item["max"]) for item in smote_cfg if item["LAY_CHECKED"]]
                min_value = [float(item["min"]) for item in smote_cfg if item["LAY_CHECKED"]]
            except KeyError:
                # Best effort: an incomplete SMOTE configuration skips
                # oversampling and the model is trained on the cleaned data.
                pass
            else:
                if smote_num:
                    data, *_ = smote(data, smote_index, smote_num, max_value, min_value)

        # NaN tokens are replaced so the text is valid JSON before re-parsing.
        result = json.loads(pca(data, percent).replace("NaN", "-1"))
        result["BeforeCleanSamNum"] = content["BeforeCleanSamNum"]
        result["AfterCleanSamNum"] = content["AfterCleanSamNum"]
        result["CleanOrNot"] = True
        return json.dumps(result)
    except Exception:
        # Boundary handler: every failure is reported to the caller as data.
        result = [{"CleanOrNot": False, "msg": traceback.format_exc()}]
        return json.dumps(result, ensure_ascii=False)
if __name__ == "__main__":
    # Manual smoke run: send one hard-coded training request through
    # clearmain and parse its reply; "aaa" only marks that the run finished.
    sample_request = json.loads(
        r'{"Train_Data":{"time":"2020-08-02 00:40:00,2020-08-02 07:36:03;2020-08-05 15:20:43,2020-08-05 18:45:46","points":"JL_D2_20DAS05A:LAV10CE101.PNT,JL_D2_20DAS05A:LAC10CE101.PNT,JL_D2_20DAS11A:HAG41CE101.PNT","interval":300000,"dead":"1,1,1","limit":"0,0,0","uplow":"null,null;null,null;null,null"},"Hyper_para":{"percent":0.94375},"type":"PCA","conditon":"1=1","smote_config":[],"smote":true,"target_point":null}'
    )
    reply = json.loads(clearmain(sample_request))
    print("aaa")