Nextcloud Code

import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, roc_auc_score,
                             roc_curve, precision_recall_curve, precision_score, recall_score, f1_score)
from imblearn.over_sampling import ADASYN
from sklearn.ensemble import IsolationForest, RandomForestClassifier

import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA  # Import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from gensim.models import Word2Vec
from sklearn.decomposition import LatentDirichletAllocation

Step 1: Load the Data from CSV

file_path = 'D:/codes/logs/Nextcloud PHP Error Logs/nextcloudfullnew.csv'
df = pd.read_csv(file_path, low_memory=False)

Step 2: Explore the Data

print(df.info())
print(df.head())

Data Cleaning

df['timestamp'] = pd.to_datetime(df['timestamp'])
df.dropna(subset=['timestamp'], inplace=True)

Feature Engineering

df['day'] = df['timestamp'].dt.day
df['hour'] = df['timestamp'].dt.hour

level_mapping = {0: 'DEBUG', 1: 'INFO', 2: 'NOTICE', 3: 'WARNING', 4: 'ERROR', 5: 'CRITICAL', 6: 'ALERT', 7: 'EMERGENCY'}
df['level_category'] = df['level'].map(level_mapping)

df['accessed_resource'] = df['url'].str.extract(r'/([^/]*)$')

def simplify_exception_type(exception_type):
    if pd.isnull(exception_type):
        return 'Unknown'
    elif 'ServiceUnavailable' in exception_type:
        return 'Service Unavailable'
    elif 'LockWaitTimeout' in exception_type:
        return 'Lock Wait Timeout'
    elif 'PDOException' in exception_type:
        return 'PDO Exception'
    else:
        return 'Other'

df['simple_exception_type'] = df['exception_type'].apply(simplify_exception_type)

df['file_extension'] = df['file_path'].str.extract(r'\.([^.]+)$')

df.fillna({'accessed_resource': 'Unknown', 'simple_exception_type': 'Unknown', 'file_extension': 'None'}, inplace=True)

Session Analysis

session_threshold = pd.Timedelta(minutes=30)
df = df.sort_values(by='timestamp')
df['session_id'] = (df['timestamp'].diff() > session_threshold).cumsum() + 1
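
To make the gap rule concrete, here is a small illustrative check (the toy timestamps below are made up, not from the logs) showing how (diff() > threshold).cumsum() + 1 numbers the sessions:

toy_times = pd.Series(pd.to_datetime(['2024-01-01 10:00', '2024-01-01 10:10', '2024-01-01 10:55']))
toy_sessions = (toy_times.diff() > pd.Timedelta(minutes=30)).cumsum() + 1
print(toy_sessions.tolist())  # [1, 1, 2] -- the 45-minute gap starts a new session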

Text Preprocessing

stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    tokenized = word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(word) for word in tokenized if word not in stop_words]
    return ' '.join(lemmatized)

df['preprocessed_message'] = df['custom_message'].fillna('').astype(str).apply(preprocess_text)

Additional Feature Engineering

df['message_length'] = df['custom_message'].fillna('').apply(len)

Session Duration

session_durations = df.groupby('session_id')['timestamp'].apply(lambda x: (x.max() - x.min()).total_seconds())

Request Frequency

request_frequency = df.groupby('session_id').size() / session_durations

Unique Requests

unique_requests = df.groupby('session_id')['url'].nunique()

Request Sequence Analysis

df['request_sequence'] = df.groupby('session_id')['url'].transform(lambda x: ' '.join(x))

User Behavior Analysis

user_session_lengths = df.groupby('user')['session_id'].nunique()
user_preferred_times = df.groupby(['user', 'hour']).size()

Aggregate Statistics

Session Frequency

sessions_per_user = df.groupby('user')['session_id'].nunique()
sessions_per_day = df.groupby(df['timestamp'].dt.date)['session_id'].nunique()

Extracting "Cron mail sync failed" from custom_message and creating a new feature

def extract_cron_sync(message):
    if "Cron mail sync failed" in message:
        return 1
    else:
        return 0

df['cron_sync_failed'] = df['custom_message'].fillna('').apply(extract_cron_sync)

Extracting "Login failed" and "Configuration Error" from prev_message and creating new features

def extract_login_failed(prev_message):
    if pd.notnull(prev_message) and "Login failed" in prev_message:
        return 1
    else:
        return 0

def extract_configuration_error(prev_message):
    if pd.notnull(prev_message) and "Configuration Error" in prev_message:
        return 1
    else:
        return 0

df['login_failed'] = df['prev_message'].fillna('').apply(extract_login_failed)
df['configuration_error'] = df['prev_message'].fillna('').apply(extract_configuration_error)

Step 3: Check Missing Values

print("Missing values before handling:") print(df.isnull().sum())

Calculate time since last request within each session

df['time_since_last_request'] = df.groupby('session_id')['timestamp'].diff().dt.total_seconds()

df['avg_request_frequency'] = df.groupby('session_id')['time_since_last_request'].transform('mean')

df['user_session_count'] = df['user'].map(user_session_lengths)

from scipy.stats import entropy

df['user_entropy'] = df.groupby('user')['session_id'].transform(lambda x: entropy(x.value_counts(normalize=True)))

Temporal Features

Time of Day (Hour)

df['hour_of_day'] = df['timestamp'].dt.hour

Day of Week

df['day_of_week'] = df['timestamp'].dt.dayofweek

df['time_since_last_request'] = df.groupby('session_id')['timestamp'].diff().dt.total_seconds()

Interaction Features

df['UserSession_MessageLength_Product'] = df['user_session_count'] * df['message_length']
df['AvgRequestFrequency_per_MessageLength'] = df['avg_request_frequency'] / df['message_length']

Text Features (using Word2Vec embeddings)

Example code: train a Word2Vec model on the preprocessed message text

text_corpus = df['preprocessed_message'].apply(str.split).tolist()
word2vec_model = Word2Vec(text_corpus, vector_size=100, window=5, min_count=1, workers=4)
word_vectors = word2vec_model.wv
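
The trained embeddings are not yet attached to the DataFrame. A minimal sketch of one common next step, averaging each message's word vectors into a fixed-length document vector (the helper name and the w2v_ column prefix are assumptions, not part of the original code):

def average_word_vectors(tokens, wv, size=100):
    # Average the vectors of tokens present in the vocabulary; zeros if none are
    vectors = [wv[token] for token in tokens if token in wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(size)

message_vectors = np.vstack([average_word_vectors(tokens, word_vectors) for tokens in text_corpus])
w2v_features = pd.DataFrame(message_vectors, columns=[f'w2v_{i}' for i in range(message_vectors.shape[1])], index=df.index)

These columns could then be concatenated to df with pd.concat along axis=1 if they are meant to feed the later models.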

Identify weekdays vs weekends

df['is_weekend'] = df['timestamp'].dt.dayofweek >= 5

Identify peak hours (adjust according to typical usage)

df['is_peak_hours'] = df['hour'].isin(range(8, 18))

Session-based Features

Session Duration

df['session_duration'] = (df.groupby('session_id')['timestamp'].transform('max') - df.groupby('session_id')['timestamp'].transform('min')).dt.total_seconds()

Request Sequence Analysis

Add feature for the number of unique requests in each session

df['unique_request_count'] = df.groupby('session_id')['url'].transform('nunique')

Add feature for the most frequent request in each session

df['most_frequent_request'] = df.groupby('session_id')['url'].transform(lambda x: x.mode()[0])

Add feature for the longest request path in each session

df['longest_request_path'] = df.groupby('session_id')['url'].transform(lambda x: x.apply(len).max())

Add feature for the average request length in each session

df['average_request_length'] = df.groupby('session_id')['url'].transform(lambda x: x.apply(len).mean())

Add feature for the entropy of request sequence in each session

df['request_sequence_entropy'] = df.groupby('session_id')['url'].transform(lambda x: entropy(x.value_counts(normalize=True)))

User Behavior Analysis

Add feature for the number of sessions per user per day

sessions_per_user_per_day = df.groupby(['user', df['timestamp'].dt.date])['session_id'].nunique()
df['sessions_per_user_per_day'] = df.apply(lambda row: sessions_per_user_per_day.get((row['user'], row['timestamp'].date()), 0), axis=1)

Add feature for the average session duration per user

average_session_duration_per_user = df.groupby('user')['session_duration'].mean()
df['average_session_duration_per_user'] = df['user'].map(average_session_duration_per_user)
df['session_activity'] = df.groupby('session_id')['url'].transform('count')

Add feature for the average session activity per user

average_session_activity_per_user = df.groupby('user')['session_activity'].mean()
df['average_session_activity_per_user'] = df['user'].map(average_session_activity_per_user)

df['session_frequency'] = df.groupby('session_id')['time_since_last_request'].transform('mean')

Add feature for the average session frequency per user

average_session_frequency_per_user = df.groupby('user')['session_frequency'].mean()
df['average_session_frequency_per_user'] = df['user'].map(average_session_frequency_per_user)

User Behavior Analysis

User Segmentation

kmeans_user = KMeans(n_clusters=3)
df['user_segment'] = kmeans_user.fit_predict(df[['session_duration', 'session_activity']])
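
n_clusters=3 is a fixed assumption; a quick, hedged sanity check is the elbow heuristic over the same two features (the cluster range and random_state below are arbitrary choices, not from the original analysis):

inertias = []
cluster_range = range(2, 9)
for k in cluster_range:
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(df[['session_duration', 'session_activity']])
    inertias.append(km.inertia_)

plt.figure(figsize=(8, 5))
plt.plot(list(cluster_range), inertias, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Check for User Segmentation')
plt.show()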

User Preferences

user_preferences = df.groupby('user')[['session_duration', 'session_activity']].mean()

Additional Features

User Engagement Metrics

user_engagement = df.groupby('user')['session_duration'].sum() / df.groupby('user')['session_activity'].sum()
df['user_engagement'] = df['user'].map(user_engagement)

Standardize numerical features

scaler = StandardScaler()
numerical_features = ['session_duration', 'session_activity', 'user_engagement']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

Session-based Features

df['session_duration_activity'] = df['session_duration'] * df['session_activity']
df['session_frequency_duration'] = df['session_frequency'] * df['session_duration']

Combine user segments with session activity

df['user_segment_activity'] = df['user_segment'] * df['session_activity']

Combine user engagement metrics with session duration

df['user_engagement_duration'] = df['user_engagement'] * df['session_duration']

Example function to extract IMAP failure

def extract_imap_failure(message):
    return 1 if "Error connecting to mail server" in message else 0

Manual labeling: flag log level 4 (ERROR) and above as anomalies

df['Anomaly'] = df['level'] >= 4

import numpy as np

Replace inf/-inf with NaN

df = df.replace([np.inf, -np.inf], np.nan)

Check for NaN values in the dataset

print(df.isna().sum())

Step 1: Identify different types of features

categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_features = df.select_dtypes(include=['int', 'float']).columns.tolist()
boolean_features = [col for col in numerical_features if set(df[col].dropna().unique()).issubset({0, 1})]
numerical_features = [col for col in numerical_features if col not in boolean_features]

datetime_features = df.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, UTC]']).columns.tolist()

Step 2: Impute missing values

Impute missing values for categorical features

for feature in categorical_features:
    df[feature] = df[feature].fillna(df[feature].mode()[0])

Impute missing values for numerical features

for column in numerical_features:
    df[column] = df[column].fillna(df[column].median())

Handling missing values in datetime columns

for feature in datetime_features:
    df[feature] = df[feature].ffill()

Handling missing values in boolean columns

for column in boolean_features:
    df[column] = df[column].fillna(0)

Convert boolean columns to integers

for column in boolean_features:
    df[column] = df[column].astype(int)

print("Missing values after handling:") print(df.isnull().sum())

columns_to_drop = ['userAgent', 'version', 'reqId', 'line_number', 'day', 'hour', 'session_id', 'hour_of_day',
                   'day_of_week', 'timestamp', 'file_path', 'app', 'user_agent', 'url', 'file_extension', 'level',
                   'user', 'trace', 'method', 'level_category', 'preprocessed_message', 'login_failed',
                   'configuration_error', 'is_weekend', 'is_peak_hours', 'user_entropy', 'accessed_resource',
                   'user_session_count', 'session_duration', 'time_since_last_request', 'simple_exception_type',
                   'most_frequent_request', 'unique_request_count', 'prev_code', 'exception_code', 'exception_type',
                   'exception_message', 'custom_message', 'prev_exception', 'prev_message', 'prev_trace']

df = df.drop(columns=columns_to_drop, errors='ignore')

print("DataFrame after dropping columns:") print(df.info())

print(df.isnull().sum())
print("DataFrame size:", df.shape)

# Feature Selection and Dimensionality Reduction with PCA
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

numerical_features = df.select_dtypes(include=[np.number]).columns
numerical_features = [col for col in numerical_features if col not in ['Anomaly']]

import seaborn as sns
import matplotlib.pyplot as plt

Create a correlation matrix

correlation_matrix = df[numerical_features].corr(method='pearson') # You can also use 'spearman' if non-linear relationships are expected

Visualize the correlation matrix as a heatmap

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title('Feature Correlation Matrix')
plt.show()

Feature-Anomaly Correlation

feature_anomaly_corr = df[numerical_features].apply(lambda x: x.corr(df['Anomaly'], method='pearson'))

print("Correlation with 'Anomaly':") print(feature_anomaly_corr)

Impute missing values for numerical features

imputer = SimpleImputer(strategy='mean')
df_imputed = imputer.fit_transform(df[numerical_features])

Standardize the features

df_standardized = StandardScaler().fit_transform(df_imputed)
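
RFE and LogisticRegression are imported above but never used. As a hedged sketch of how they could rank the standardized features against the Anomaly label (the choice of 10 features and max_iter=1000 are assumptions, not part of the original pipeline):

rfe = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=10)
rfe.fit(df_standardized, df['Anomaly'])
for name, rank in zip(numerical_features, rfe.ranking_):
    print(f"{name}: rank {rank}")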

import matplotlib.pyplot as plt

pca = PCA().fit(df_standardized) # Assuming df_standardized is your feature matrix

Plot the cumulative sum of the explained variance ratio

plt.figure(figsize=(8, 5))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by Components')
plt.grid(True)
plt.show()
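
Instead of reading the elbow off the plot, the component count can also be chosen programmatically; a minimal sketch assuming a 95% explained-variance target (the threshold is an assumption, not from the original analysis):

cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components_95 = int(np.argmax(cumulative_variance >= 0.95)) + 1
print(f"Components needed for 95% variance: {n_components_95}")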

Apply PCA with the selected number of components

pca = PCA(n_components=20)
pca_result = pca.fit_transform(df_standardized)

Getting PCA loadings

feature_names = df[numerical_features].columns
pca_loadings_df = pd.DataFrame(pca.components_.T, index=feature_names, columns=[f'PC{i+1}' for i in range(pca_result.shape[1])])

print("PCA Loadings:") print(pca_loadings_df)

from keras.models import Sequential
from keras.layers import LSTM, Dense

X_train, X_test, y_train, y_test = train_test_split(pca_result, df['Anomaly'], test_size=0.2, random_state=42)

Import necessary libraries

from imblearn.over_sampling import SMOTE

Apply SMOTE to training data

smote = SMOTE(k_neighbors=10)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
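
It is worth confirming what the oversampling actually did; a quick hedged check of the class counts before and after resampling:

print("Before SMOTE:", pd.Series(y_train).value_counts().to_dict())
print("After SMOTE: ", pd.Series(y_train_resampled).value_counts().to_dict())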

Isolation Forest for Anomaly Detection

iso_forest = IsolationForest(n_estimators=50, contamination=0.05, random_state=42)
iso_forest.fit(X_train_resampled)

y_pred_iso = iso_forest.predict(X_test)

accuracy_iso = accuracy_score(y_test, (y_pred_iso == -1).astype(int))
print("Isolation Forest Accuracy:", accuracy_iso)
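
Accuracy alone can be misleading when anomalies are rare. Since precision_recall_curve is already imported, one hedged option is to inspect precision and recall over the Isolation Forest anomaly scores (score_samples is negated so that higher means more anomalous):

anomaly_scores = -iso_forest.score_samples(X_test)
precision, recall, thresholds = precision_recall_curve(y_test, anomaly_scores)

plt.figure(figsize=(8, 5))
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve (Isolation Forest scores)')
plt.show()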

from sklearn.inspection import permutation_importance

Assuming iso_forest is your trained Isolation Forest model and X_test is your test dataset

def iso_scorer(estimator, X, y):
    # IsolationForest predicts -1 for anomalies and 1 for inliers, so map its output to 0/1 before comparing with y
    return accuracy_score(y, (estimator.predict(X) == -1).astype(int))

result = permutation_importance(iso_forest, X_test, y_test, n_repeats=10, random_state=42, scoring=iso_scorer)

Get the importance of each feature

feature_importances = result.importances_mean

Display the feature importances

# X_test holds the PCA components, so label the importances by component
pca_feature_names = [f'PC{i+1}' for i in range(X_test.shape[1])]
for feature, importance in zip(pca_feature_names, feature_importances):
    print(f"{feature}: {importance}")

LSTM for Sequence Data

X_train_lstm = np.array(X_train_resampled).reshape((X_train_resampled.shape[0], 1, X_train_resampled.shape[1]))
X_test_lstm = np.array(X_test).reshape((X_test.shape[0], 1, X_test.shape[1]))

from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.regularizers import l1_l2

model = Sequential()
model.add(LSTM(10, return_sequences=True, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]),
               kernel_regularizer=l1_l2(l1=0.0001, l2=0.0001)))  # Increased units, added L2 regularization
model.add(Dropout(0.2))  # Adding dropout
model.add(LSTM(5, kernel_regularizer=l1_l2(l1=0.0001, l2=0.0001)))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_lstm, y_train_resampled, epochs=10, batch_size=64, validation_split=0.25)  # Adjusted epochs, batch size, and validation split
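
As an alternative to the fixed 10-epoch fit above, training could use early stopping on the validation loss; a minimal sketch (the patience value and epoch cap are assumptions):

from keras.callbacks import EarlyStopping

# Stop when validation loss has not improved for 3 epochs and keep the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(X_train_lstm, y_train_resampled, epochs=30, batch_size=64,
          validation_split=0.25, callbacks=[early_stop])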

Make predictions

y_pred_lstm_prob = model.predict(X_test_lstm)
y_pred_lstm = (y_pred_lstm_prob > 0.5).astype(int)

Predict anomalies using Isolation Forest

y_pred_iso = iso_forest.predict(X_test)

Convert predictions to binary labels

y_pred_iso_binary = (y_pred_iso == -1).astype(int)

Combine predictions

y_pred_combined = np.logical_or(y_pred_lstm.flatten(), y_pred_iso_binary)

Evaluation Metrics

print("Combined Performance:") print(classification_report(y_test, y_pred_combined)) print("ROC AUC score:", roc_auc_score(y_test, y_pred_combined))

Plot ROC Curve

fpr, tpr, _ = roc_curve(y_test, y_pred_combined)

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label='Combined ROC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Combined Model')
plt.legend()
plt.show()

import numpy as np
import tensorflow as tf

Define a function to compute Integrated Gradients

def integrated_gradients(inputs, model, baseline=None, num_steps=50):
    if baseline is None:
        baseline = np.zeros_like(inputs)

    # Create a linear path from baseline to the inputs
    alphas = np.linspace(0, 1, num_steps)

    # Initialize an empty list to store the gradients along the path
    integrated_grads = []

    # Convert inputs and baseline to TensorFlow tensors
    inputs_tf = tf.convert_to_tensor(inputs, dtype=tf.float32)
    baseline_tf = tf.convert_to_tensor(baseline, dtype=tf.float32)

    # Compute gradients at each interpolation step along the path
    for alpha in alphas:
        interpolated_input = baseline_tf + alpha * (inputs_tf - baseline_tf)
        with tf.GradientTape() as tape:
            tape.watch(interpolated_input)
            predictions = model(interpolated_input)
        grads = tape.gradient(predictions, interpolated_input)
        integrated_grads.append(grads)

    # Approximate the path integral by averaging the gradients (Riemann approximation)
    integrated_grads = np.array(integrated_grads)
    avg_grads = np.mean(integrated_grads, axis=0)
    attributions = (inputs_tf - baseline_tf) * avg_grads
    return attributions

Assuming X_test_lstm contains your test data

Compute Integrated Gradients for a single sample

sample_idx = 0  # Choose the index of the sample you want to explain
sample = X_test_lstm[sample_idx:sample_idx+1]
baseline = np.zeros_like(sample)  # You can choose a different baseline if needed
ig = integrated_gradients(sample, model, baseline=baseline)
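
Attributions for a single sample can be noisy. A hypothetical extension (the sample count and variable names below are assumptions) averages absolute Integrated Gradients over a batch of test rows:

n_samples = min(100, X_test_lstm.shape[0])
ig_batch = integrated_gradients(X_test_lstm[:n_samples], model)
mean_attribution = np.abs(ig_batch.numpy()).mean(axis=(0, 1))
print("Mean |IG| per input component:", np.round(mean_attribution, 4))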

Visualize the impact of features on the prediction

feature_names = [f'PC{i+1}' for i in range(X_test_lstm.shape[2])]  # The model inputs are the PCA components, so label them as such

Check the lengths of feature_names and the feature importances array

print("Length of feature_names:", len(feature_names)) print("Length of feature importances:", len(np.abs(ig[0].numpy().mean(axis=0))))

plt.figure(figsize=(10, 6))
plt.bar(feature_names, np.abs(ig[0].numpy().mean(axis=0)))
plt.title('Integrated Gradients - Feature Importance')
plt.xlabel('Features')
plt.ylabel('Integrated Gradients')
plt.xticks(rotation=45, ha='right')
plt.show()

Combine the permutation importances from the Isolation Forest with the Integrated Gradients from the LSTM model

Normalize permutation importances

if_max, if_min = max(feature_importances), min(feature_importances)
normalized_if_importances = (feature_importances - if_min) / (if_max - if_min)

Normalize Integrated Gradients

ig_mean = np.mean(np.abs(ig[0].numpy()), axis=0)
ig_max, ig_min = max(ig_mean), min(ig_mean)
normalized_ig = (ig_mean - ig_min) / (ig_max - ig_min)

Combine importances

combined_importances = normalized_if_importances + normalized_ig

Visualize the combined feature importances

plt.figure(figsize=(10, 6))
plt.bar(feature_names, combined_importances)
plt.title('Combined Feature Importances')
plt.xlabel('Features')
plt.ylabel('Combined Importances')
plt.xticks(rotation=45, ha='right')
plt.show()

Zip feature names and combined importances together

importance_pairs = zip(feature_names, combined_importances)

Print feature names and importances

for feature, importance in importance_pairs:
    print(f'{feature}: {importance:.2f}')
