import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             roc_auc_score, roc_curve, precision_recall_curve,
                             precision_score, recall_score, f1_score)
from imblearn.over_sampling import ADASYN
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, LatentDirichletAllocation
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from gensim.models import Word2Vec
file_path = 'D:/codes/logs/Nextcloud PHP Error Logs/nextcloudfullnew.csv'
df = pd.read_csv(file_path, low_memory=False)

print(df.info())
print(df.head())
# Parse timestamps (coercing unparseable values to NaT) and drop rows without one.
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
df.dropna(subset=['timestamp'], inplace=True)

df['day'] = df['timestamp'].dt.day
df['hour'] = df['timestamp'].dt.hour
level_mapping = {0: 'DEBUG', 1: 'INFO', 2: 'NOTICE', 3: 'WARNING', 4: 'ERROR',
                 5: 'CRITICAL', 6: 'ALERT', 7: 'EMERGENCY'}
df['level_category'] = df['level'].map(level_mapping)
df['accessed_resource'] = df['url'].str.extract(r'/([^/]*)$', expand=False)
def simplify_exception_type(exception_type):
    if pd.isnull(exception_type):
        return 'Unknown'
    elif 'ServiceUnavailable' in exception_type:
        return 'Service Unavailable'
    elif 'LockWaitTimeout' in exception_type:
        return 'Lock Wait Timeout'
    elif 'PDOException' in exception_type:
        return 'PDO Exception'
    else:
        return 'Other'

df['simple_exception_type'] = df['exception_type'].apply(simplify_exception_type)
# Grab the file extension (text after the final dot); the dot must be escaped to match literally.
df['file_extension'] = df['file_path'].str.extract(r'\.([^.]+)$', expand=False)
df.fillna({'accessed_resource': 'Unknown', 'simple_exception_type': 'Unknown', 'file_extension': 'None'}, inplace=True)
# Sessionise the log: a gap of more than 30 minutes between consecutive entries starts a new session.
session_threshold = pd.Timedelta(minutes=30)
df = df.sort_values(by='timestamp')
df['session_id'] = (df['timestamp'].diff() > session_threshold).cumsum() + 1

stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    tokenized = word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(word) for word in tokenized if word not in stop_words]
    return ' '.join(lemmatized)
df['preprocessed_message'] = df['custom_message'].fillna('').astype(str).apply(preprocess_text)
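# TfidfVectorizer is imported at the top but never applied in this script. A minimal sketch
# of how the preprocessed messages could be turned into TF-IDF features is shown below; the
# names (tfidf, message_tfidf) and the max_features value are illustrative assumptions,
# not part of the original pipeline.
tfidf = TfidfVectorizer(max_features=100)
message_tfidf = tfidf.fit_transform(df['preprocessed_message'])
print("TF-IDF matrix shape:", message_tfidf.shape)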
df['message_length'] = df['custom_message'].fillna('').apply(len)
session_durations = df.groupby('session_id')['timestamp'].apply(lambda x: (x.max() - x.min()).total_seconds())
request_frequency = df.groupby('session_id').size() / session_durations
unique_requests = df.groupby('session_id')['url'].nunique()
df['request_sequence'] = df.groupby('session_id')['url'].transform(lambda x: ' '.join(x))
user_session_lengths = df.groupby('user')['session_id'].nunique()
user_preferred_times = df.groupby(['user', 'hour']).size()

sessions_per_user = df.groupby('user')['session_id'].nunique()
sessions_per_day = df.groupby(df['timestamp'].dt.date)['session_id'].nunique()
def extract_cron_sync(message):
    if "Cron mail sync failed" in message:
        return 1
    else:
        return 0

df['cron_sync_failed'] = df['custom_message'].fillna('').apply(extract_cron_sync)
def extract_login_failed(prev_message):
    if pd.notnull(prev_message) and "Login failed" in prev_message:
        return 1
    else:
        return 0

def extract_configuration_error(prev_message):
    if pd.notnull(prev_message) and "Configuration Error" in prev_message:
        return 1
    else:
        return 0

df['login_failed'] = df['prev_message'].fillna('').apply(extract_login_failed)
df['configuration_error'] = df['prev_message'].fillna('').apply(extract_configuration_error)
print("Missing values before handling:") print(df.isnull().sum())
df['time_since_last_request'] = df.groupby('session_id')['timestamp'].diff().dt.total_seconds()
df['avg_request_frequency'] = df.groupby('session_id')['time_since_last_request'].transform('mean')
df['user_session_count'] = df['user'].map(user_session_lengths)
from scipy.stats import entropy

df['user_entropy'] = df.groupby('user')['session_id'].transform(lambda x: entropy(x.value_counts(normalize=True)))
df['hour_of_day'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['UserSession_MessageLength_Product'] = df['user_session_count'] * df['message_length']
df['AvgRequestFrequency_per_MessageLength'] = df['avg_request_frequency'] / df['message_length']
text_corpus = df['preprocessed_message'].apply(str.split).tolist()
word2vec_model = Word2Vec(text_corpus, vector_size=100, window=5, min_count=1, workers=4)
word_vectors = word2vec_model.wv
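# The Word2Vec vectors trained above are not fed into the feature set anywhere below. One
# hedged sketch, assuming mean pooling over the tokens of each message; the helper name
# message_vector and the w2v_features array are illustrative, not part of the original code.
def message_vector(tokens):
    # Average the vectors of the tokens present in the vocabulary; zero vector if none are.
    vecs = [word_vectors[t] for t in tokens if t in word_vectors]
    return np.mean(vecs, axis=0) if vecs else np.zeros(word2vec_model.vector_size)

w2v_features = np.vstack([message_vector(tokens) for tokens in text_corpus])
# w2v_features could then be concatenated with the numerical features before modelling.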
df['is_weekend'] = df['timestamp'].dt.dayofweek >= 5
df['is_peak_hours'] = df['hour'].isin(range(8, 18))
df['session_duration'] = (df.groupby('session_id')['timestamp'].transform('max') - df.groupby('session_id')['timestamp'].transform('min')).dt.total_seconds()
df['unique_request_count'] = df.groupby('session_id')['url'].transform('nunique')
df['most_frequent_request'] = df.groupby('session_id')['url'].transform(lambda x: x.mode()[0])
df['longest_request_path'] = df.groupby('session_id')['url'].transform(lambda x: x.apply(len).max())
df['average_request_length'] = df.groupby('session_id')['url'].transform(lambda x: x.apply(len).mean())
df['request_sequence_entropy'] = df.groupby('session_id')['url'].transform(lambda x: entropy(x.value_counts(normalize=True)))
sessions_per_user_per_day = df.groupby(['user', df['timestamp'].dt.date])['session_id'].nunique()
df['sessions_per_user_per_day'] = df.apply(
    lambda row: sessions_per_user_per_day.get((row['user'], row['timestamp'].date()), 0), axis=1)

average_session_duration_per_user = df.groupby('user')['session_duration'].mean()
df['average_session_duration_per_user'] = df['user'].map(average_session_duration_per_user)
df['session_activity'] = df.groupby('session_id')['url'].transform('count')

average_session_activity_per_user = df.groupby('user')['session_activity'].mean()
df['average_session_activity_per_user'] = df['user'].map(average_session_activity_per_user)
df['session_frequency'] = df.groupby('session_id')['time_since_last_request'].transform('mean')
average_session_frequency_per_user = df.groupby('user')['session_frequency'].mean()
df['average_session_frequency_per_user'] = df['user'].map(average_session_frequency_per_user)

kmeans_user = KMeans(n_clusters=3)
df['user_segment'] = kmeans_user.fit_predict(df[['session_duration', 'session_activity']])

user_preferences = df.groupby('user')[['session_duration', 'session_activity']].mean()

user_engagement = df.groupby('user')['session_duration'].sum() / df.groupby('user')['session_activity'].sum()
df['user_engagement'] = df['user'].map(user_engagement)

scaler = StandardScaler()
numerical_features = ['session_duration', 'session_activity', 'user_engagement']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

df['session_duration_activity'] = df['session_duration'] * df['session_activity']
df['session_frequency_duration'] = df['session_frequency'] * df['session_duration']
df['user_segment_activity'] = df['user_segment'] * df['session_activity']
df['user_engagement_duration'] = df['user_engagement'] * df['session_duration']
def extract_imap_failure(message):
    return 1 if "Error connecting to mail server" in message else 0
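# extract_imap_failure is defined above but never applied in the original script. Applying
# it in the same way as the other message flags (the imap_failure column name is an assumption):
df['imap_failure'] = df['custom_message'].fillna('').apply(extract_imap_failure)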
df['Anomaly'] = df['level'] >= 4
# Replace infinities (e.g., from division by zero-length messages) with NaN so they are
# handled by the imputation below.
df = df.replace([np.inf, -np.inf], np.nan)
print(df.isna().sum())
# Split columns by dtype: categorical, numerical, boolean-like (0/1 only), and datetime.
categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_features = df.select_dtypes(include=['int', 'float']).columns.tolist()
boolean_features = [col for col in numerical_features
                    if set(df[col].dropna().unique()).issubset({0, 1})]
numerical_features = [col for col in numerical_features if col not in boolean_features]
datetime_features = df.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, UTC]']).columns.tolist()
for feature in categorical_features:
    df[feature] = df[feature].fillna(df[feature].mode()[0])

for column in numerical_features:
    df[column] = df[column].fillna(df[column].median())

for feature in datetime_features:
    df[feature] = df[feature].ffill()

for column in boolean_features:
    df[column] = df[column].fillna(0).astype(int)
print("Missing values after handling:") print(df.isnull().sum())
columns_to_drop = ['userAgent', 'version', 'reqId', 'line_number', 'day', 'hour', 'session_id',
                   'hour_of_day', 'day_of_week', 'timestamp', 'file_path', 'app', 'user_agent',
                   'url', 'file_extension', 'level', 'user', 'trace', 'method', 'level_category',
                   'preprocessed_message', 'login_failed', 'configuration_error', 'is_weekend',
                   'is_peak_hours', 'user_entropy', 'accessed_resource', 'user_session_count',
                   'session_duration', 'time_since_last_request', 'simple_exception_type',
                   'most_frequent_request', 'unique_request_count', 'prev_code', 'exception_code',
                   'exception_type', 'exception_message', 'custom_message', 'prev_exception',
                   'prev_message', 'prev_trace']
df = df.drop(columns=columns_to_drop, errors='ignore')
print("DataFrame after dropping columns:") print(df.info())
print(df.isnull().sum()) print("DataFrame size:", df.shape)
# Feature Selection and Dimensionality Reduction with PCA
# (most of the modules used below are already imported at the top of the script)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
numerical_features = df.select_dtypes(include=[np.number]).columns
numerical_features = [col for col in numerical_features if col not in ['Anomaly']]
import seaborn as sns

# Pearson correlation between the remaining numerical features
# ('spearman' could be used instead if non-linear relationships are expected).
correlation_matrix = df[numerical_features].corr(method='pearson')

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title('Feature Correlation Matrix')
plt.show()
# Correlation of each numerical feature with the (boolean) anomaly label.
feature_anomaly_corr = df[numerical_features].apply(lambda x: x.corr(df['Anomaly'].astype(int), method='pearson'))
print("Correlation with 'Anomaly':") print(feature_anomaly_corr)
imputer = SimpleImputer(strategy='mean')
df_imputed = imputer.fit_transform(df[numerical_features])
df_standardized = StandardScaler().fit_transform(df_imputed)
# Inspect how much variance each principal component explains.
pca = PCA().fit(df_standardized)

plt.figure(figsize=(8, 5))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by Components')
plt.grid(True)
plt.show()
pca = PCA(n_components=20)
pca_result = pca.fit_transform(df_standardized)
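# Added diagnostic (not in the original script): how much of the total variance the
# 20 retained components explain.
print("Variance explained by 20 components:", pca.explained_variance_ratio_.sum())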
feature_names = df[numerical_features].columns
pca_loadings_df = pd.DataFrame(pca.components_.T, index=feature_names,
                               columns=[f'PC{i+1}' for i in range(pca_result.shape[1])])
print("PCA Loadings:") print(pca_loadings_df)
from keras.models import Sequential
from keras.layers import LSTM, Dense
X_train, X_test, y_train, y_test = train_test_split(pca_result, df['Anomaly'], test_size=0.2, random_state=42)
from imblearn.over_sampling import SMOTE
# Oversample the minority (anomalous) class in the training split.
smote = SMOTE(k_neighbors=10)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
iso_forest = IsolationForest(n_estimators=50, contamination=0.05, random_state=42)
iso_forest.fit(X_train_resampled)
y_pred_iso = iso_forest.predict(X_test)
# Map Isolation Forest output (-1 = anomaly, 1 = inlier) to 0/1 before scoring.
accuracy_iso = accuracy_score(y_test, (y_pred_iso == -1).astype(int))
print("Isolation Forest Accuracy:", accuracy_iso)
from sklearn.inspection import permutation_importance
# permutation_importance scores iso_forest.predict against the labels, so map the 0/1
# labels to the Isolation Forest convention (-1 = anomaly, 1 = inlier) before scoring.
y_test_iso = np.where(y_test, -1, 1)
result = permutation_importance(iso_forest, X_test, y_test_iso, n_repeats=10, random_state=42, scoring='accuracy')
feature_importances = result.importances_mean
# X_test is in PCA space, so these importances refer to principal components rather than
# the original columns.
for feature, importance in zip([f'PC{i+1}' for i in range(X_test.shape[1])], feature_importances):
    print(f"{feature}: {importance}")
# Add a time-step dimension so the data matches the LSTM's (samples, timesteps, features) input shape.
X_train_lstm = np.array(X_train_resampled).reshape((X_train_resampled.shape[0], 1, X_train_resampled.shape[1]))
X_test_lstm = np.array(X_test).reshape((X_test.shape[0], 1, X_test.shape[1]))
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.regularizers import l1_l2

model = Sequential()
model.add(LSTM(10, return_sequences=True, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]),
               kernel_regularizer=l1_l2(l1=0.0001, l2=0.0001)))  # L1/L2 regularisation on the first LSTM layer
model.add(Dropout(0.2))  # dropout between the LSTM layers
model.add(LSTM(5, kernel_regularizer=l1_l2(l1=0.0001, l2=0.0001)))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_lstm, y_train_resampled, epochs=10, batch_size=64, validation_split=0.25)
y_pred_lstm_prob = model.predict(X_test_lstm)
y_pred_lstm = (y_pred_lstm_prob > 0.5).astype(int)
y_pred_iso = iso_forest.predict(X_test)
y_pred_iso_binary = (y_pred_iso == -1).astype(int)
y_pred_combined = np.logical_or(y_pred_lstm.flatten(), y_pred_iso_binary)
print("Combined Performance:") print(classification_report(y_test, y_pred_combined)) print("ROC AUC score:", roc_auc_score(y_test, y_pred_combined))
fpr, tpr, _ = roc_curve(y_test, y_pred_combined)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='Combined ROC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Combined Model')
plt.legend()
plt.show()
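# precision_recall_curve is imported at the top but never used. Since anomalies are the rare
# class, a precision-recall view of the LSTM probabilities can complement the ROC curve; the
# sketch below is an added illustration, not part of the original code.
precision, recall, _ = precision_recall_curve(y_test, y_pred_lstm_prob.flatten())
plt.figure(figsize=(10, 6))
plt.plot(recall, precision, label='LSTM Precision-Recall')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve for LSTM Probabilities')
plt.legend()
plt.show()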
import tensorflow as tf
def integrated_gradients(inputs, model, baseline=None, num_steps=50):
    """Approximate integrated gradients for `inputs` relative to `baseline`."""
    if baseline is None:
        baseline = np.zeros_like(inputs)
    # Interpolation coefficients along the straight-line path from baseline to inputs.
    alphas = np.linspace(0, 1, num_steps)
    integrated_grads = []
    # Convert inputs and baseline to TensorFlow tensors.
    inputs_tf = tf.convert_to_tensor(inputs, dtype=tf.float32)
    baseline_tf = tf.convert_to_tensor(baseline, dtype=tf.float32)
    # Compute gradients at each interpolation step along the path.
    for alpha in alphas:
        interpolated_input = baseline_tf + alpha * (inputs_tf - baseline_tf)
        with tf.GradientTape() as tape:
            tape.watch(interpolated_input)
            predictions = model(interpolated_input)
        grads = tape.gradient(predictions, interpolated_input)
        integrated_grads.append(grads)
    # Approximate the path integral by averaging the gradients and scaling by the input difference.
    integrated_grads = np.array(integrated_grads)
    avg_grads = np.mean(integrated_grads, axis=0)
    attributions = (inputs_tf - baseline_tf) * avg_grads
    return attributions
sample_idx = 0  # index of the sample to explain
sample = X_test_lstm[sample_idx:sample_idx + 1]
baseline = np.zeros_like(sample)  # a different baseline could be chosen if needed
ig = integrated_gradients(sample, model, baseline=baseline)
# The integrated gradients are computed in PCA space, so label the bars by principal component.
feature_names = [f'PC{i+1}' for i in range(X_test_lstm.shape[2])]

print("Length of feature_names:", len(feature_names))
print("Length of feature importances:", len(np.abs(ig[0].numpy().mean(axis=0))))

plt.figure(figsize=(10, 6))
plt.bar(feature_names, np.abs(ig[0].numpy().mean(axis=0)))
plt.title('Integrated Gradients - Feature Importance')
plt.xlabel('Features')
plt.ylabel('Integrated Gradients')
plt.xticks(rotation=45, ha='right')
plt.show()
# Min-max normalise both importance vectors so they can be combined on the same scale.
if_max, if_min = max(feature_importances), min(feature_importances)
normalized_if_importances = (feature_importances - if_min) / (if_max - if_min)

ig_mean = np.mean(np.abs(ig[0].numpy()), axis=0)
ig_max, ig_min = max(ig_mean), min(ig_mean)
normalized_ig = (ig_mean - ig_min) / (ig_max - ig_min)
combined_importances = normalized_if_importances + normalized_ig
plt.figure(figsize=(10, 6))
plt.bar(feature_names, combined_importances)
plt.title('Combined Feature Importances')
plt.xlabel('Features')
plt.ylabel('Combined Importances')
plt.xticks(rotation=45, ha='right')
plt.show()
importance_pairs = zip(feature_names, combined_importances)
for feature, importance in importance_pairs:
    print(f'{feature}: {importance:.2f}')