Upload 6 files
Browse files- .gitattributes +1 -0
- labelData.csv +10 -0
- malware_classifier_lime.h5 +3 -0
- model_training.py +131 -0
- predict.py +61 -0
- spectrum.tar +3 -0
- validate.py +50 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
spectrum.tar filter=lfs diff=lfs merge=lfs -text
|
labelData.csv
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Login successful for user admin,0
|
2 |
+
User admin added a new user john_doe,0
|
3 |
+
Failed login attempt from IP 192.168.1.55,1
|
4 |
+
User john_doe deleted file important_document.txt,0
|
5 |
+
New user jane_doe registered,0
|
6 |
+
Unauthorized access from IP 10.0.0.27,1
|
7 |
+
User admin updated file financial_report.xlsx,0
|
8 |
+
Failed login attempt from IP 192.168.1.56,1
|
9 |
+
User jane_doe uploaded file meeting_notes.docx,0
|
10 |
+
Unauthorized access from IP 10.0.0.28,1
|
malware_classifier_lime.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b23d9ceac1ec5ac69287c9a597d51586a017dcc525aede8a6b0953ba0597ca56
|
3 |
+
size 53156780
|
model_training.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# The goal of this script is to train the model and then save it.
|
2 |
+
import os
|
3 |
+
import tensorflow as tf
|
4 |
+
from tensorflow.keras.preprocessing.image import ImageDataGenerator
|
5 |
+
from tensorflow.keras.models import Sequential
|
6 |
+
from tensorflow.keras.layers import Conv2D, Activation, MaxPooling2D, Flatten, Dense, Dropout
|
7 |
+
from tensorflow.keras.optimizers import Adam
|
8 |
+
from tensorflow.keras.models import load_model
|
9 |
+
from tensorflow.keras.preprocessing.image import load_img, img_to_array
|
10 |
+
import shutil
|
11 |
+
from PIL import Image
|
12 |
+
from tensorflow.keras.preprocessing.image import load_img, img_to_array
|
13 |
+
import matplotlib.pyplot as plt
|
14 |
+
import cv2
|
15 |
+
#import seaborn as sns
|
16 |
+
import numpy as np
|
17 |
+
import pickle
|
18 |
+
|
19 |
+
def clean_directory(directory, cache_file="cache.pkl"):
    """Delete unreadable images under ``directory`` and count class folders.

    Every leaf directory (one that has no sub-directories) is treated as a
    class. Each file in it is opened with PIL and verified; files that fail
    are deleted. A leaf directory left with no valid files is removed and
    not counted.

    The count is pickled to ``cache_file``. If ``cache_file`` already exists
    it is returned as-is and nothing is scanned or deleted.

    NOTE: the cache is keyed only by ``cache_file``; it is not invalidated
    when ``directory`` or its contents change. Delete the cache file to force
    a rescan.

    Args:
        directory: Root of the image tree to clean.
        cache_file: Path of the pickle file holding the cached class count.

    Returns:
        The number of leaf directories that still contain at least one
        valid image (or the cached value).
    """
    if os.path.exists(cache_file):
        with open(cache_file, "rb") as f:
            num_classes = pickle.load(f)
        print("Loaded cached results.")
        return num_classes

    num_classes = 0
    for subdir, dirs, files in os.walk(directory):
        if dirs:
            # Only leaf directories hold class images.
            continue

        valid_files = []
        for file in files:
            file_path = os.path.join(subdir, file)
            try:
                # The context manager closes the file handle even when
                # verify() raises, so the os.remove() below cannot fail
                # because the file is still open (notably on Windows).
                with Image.open(file_path) as img:
                    img.verify()  # Verify if the image is not corrupted
            except (IOError, SyntaxError):
                print(f"Removing corrupted file: {file_path}")
                os.remove(file_path)
            else:
                valid_files.append(file)

        if valid_files:
            num_classes += 1
        else:
            # Remove empty directories
            print(f"Removing empty directory: {subdir}")
            shutil.rmtree(subdir)

    # Save the results in cache
    with open(cache_file, "wb") as f:
        pickle.dump(num_classes, f)
    print("Saved results to cache.")

    return num_classes
|
53 |
+
|
54 |
+
data_dir = 'Malign/extract'

# Remove unreadable images and empty class folders before Keras indexes the
# directory tree. The returned count is kept for reference; the network's
# output width below is taken from the generator itself.
num_classes = clean_directory(data_dir)

# Parameters
batch_size = 32
epochs = 50
image_size = (200, 200)  # Set the desired image size for input to the model

# Data preprocessing
train_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2  # Split 20% of data for validation
)

train_generator = train_datagen.flow_from_directory(
    data_dir,
    target_size=image_size,
    batch_size=batch_size,
    class_mode='categorical',
    subset='training'
)

validation_generator = train_datagen.flow_from_directory(
    data_dir,
    target_size=image_size,
    batch_size=batch_size,
    class_mode='categorical',
    subset='validation'
)

# Model creation
model = Sequential()

# First convolution layer
model.add(Conv2D(64, (3, 3), input_shape=(*image_size, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Second convolution layer
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Third convolution layer
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Fully connected layers
model.add(Flatten())
model.add(Dense(128))
model.add(Dropout(0.5))
model.add(Activation('relu'))

# Output layer
# The width must equal the number of one-hot label columns produced by the
# generator; a hard-coded value breaks training as soon as the dataset has a
# different number of class folders.
model.add(Dense(train_generator.num_classes))
model.add(Activation('softmax'))

model.summary()

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Model training
history = model.fit(
    train_generator,
    epochs=epochs,
    validation_data=validation_generator
)

# Save the trained model
model.save("malware_classifier_lime.h5")
|
129 |
+
|
130 |
+
|
131 |
+
|
predict.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import numpy as np
|
3 |
+
import tensorflow as tf
|
4 |
+
from tensorflow.keras.models import load_model
|
5 |
+
from tensorflow.keras.preprocessing.image import load_img, img_to_array
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
from lime import lime_image
|
8 |
+
from skimage.segmentation import mark_boundaries
|
9 |
+
|
10 |
+
def explain_instance(image_path, model, num_features=5, num_samples=1000):
    """Produce a LIME explanation for a single image file.

    The image is loaded at ``image_size``, converted to an array and divided
    by 255 before being handed to the LIME explainer together with
    ``model.predict``.

    NOTE: this function reads the module-level names ``image_size``,
    ``explainer`` and ``num_classes``. In this file they are assigned inside
    the ``__main__`` block, so they must exist before this is called.

    Args:
        image_path: Path of the image to explain.
        model: Object whose ``predict`` method is passed to LIME.
        num_features: Forwarded to ``explainer.explain_instance``.
        num_samples: Forwarded to ``explainer.explain_instance``.

    Returns:
        Whatever ``explainer.explain_instance`` returns.
    """
    pixels = img_to_array(load_img(image_path, target_size=image_size)) / 255
    return explainer.explain_instance(
        pixels,
        model.predict,
        top_labels=num_classes,
        hide_color=0,
        num_samples=num_samples,
        num_features=num_features,
    )
|
16 |
+
|
17 |
+
if __name__ == "__main__":
    # Local import: only this entry point needs filesystem access.
    import os

    if len(sys.argv) != 2:
        print("Usage: predict.py image_path")
        sys.exit(1)

    image_path = sys.argv[1]
    image_size = (200, 200)
    model_path = "malware_classifier_lime.h5"
    model = load_model(model_path)
    # Take the class count from the loaded model rather than a literal so the
    # script keeps working if the model is retrained on a different dataset.
    num_classes = model.output_shape[-1]

    # explain_instance() reads image_size, explainer and num_classes as
    # module-level names, so they must be assigned before this call.
    explainer = lime_image.LimeImageExplainer()
    explanation = explain_instance(image_path, model)

    temp, mask = explanation.get_image_and_mask(
        explanation.top_labels[0],
        positive_only=True,
        num_features=5,
        hide_rest=False,
    )

    # Load and scale the image once; it is reused for display and prediction.
    img = load_img(image_path, target_size=image_size)
    img_array = img_to_array(img) / 255

    # Display the original image
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.imshow(img_array)
    plt.title("Original Image")
    plt.axis("off")

    # Display the LIME explanation
    plt.subplot(1, 2, 2)
    plt.imshow(mark_boundaries(temp, mask))
    plt.title("LIME Explanation")
    plt.axis("off")

    plt.show()

    # Make a prediction
    prediction = model.predict(np.expand_dims(img_array, axis=0))
    predicted_class = np.argmax(prediction)

    # Get the class name
    # No training generator exists in this script, so rebuild the mapping the
    # way Keras' flow_from_directory does: class folders sorted by name.
    # Falls back to "unknown" when the training directory is not available.
    data_dir = "Malign/extract"
    class_names = []
    if os.path.isdir(data_dir):
        class_names = sorted(
            entry
            for entry in os.listdir(data_dir)
            if os.path.isdir(os.path.join(data_dir, entry))
        )
    if predicted_class < len(class_names):
        class_name = class_names[predicted_class]
    else:
        class_name = "unknown"

    print(f"Predicted class: {predicted_class}, Class name: {class_name}")
|
spectrum.tar
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:485d96d9d396c57cde7433181d3645ebe5f87155972921a50ee101f9882a515d
|
3 |
+
size 542136320
|
validate.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import tensorflow as tf
|
4 |
+
from tensorflow.keras.preprocessing.image import ImageDataGenerator
|
5 |
+
from tensorflow.keras.models import load_model
|
6 |
+
from sklearn.metrics import classification_report, confusion_matrix
|
7 |
+
import pickle
|
8 |
+
|
9 |
+
# Load the saved model
model = load_model("malware_classifier_lime.h5")

data_dir = 'Malign/extract'

# Parameters
batch_size = 32
image_size = (200, 200)

# Data preprocessing
# Use the same 20% validation split as the training script so the metrics
# below are computed on held-out images rather than on the training data.
test_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

test_generator = test_datagen.flow_from_directory(
    data_dir,
    target_size=image_size,
    batch_size=batch_size,
    class_mode='categorical',
    subset='validation',
    shuffle=False  # keep prediction order aligned with test_generator.classes
)

# Class names ordered by their label index, plus the matching index list.
class_names = [
    name
    for name, _ in sorted(test_generator.class_indices.items(), key=lambda kv: kv[1])
]
class_labels = list(range(len(class_names)))

# Evaluate the model
print("Evaluating the model...")
score = model.evaluate(test_generator)
print("Loss: ", score[0])
print("Accuracy: ", score[1])

# Predict the class labels
print("Predicting the class labels...")
y_pred = model.predict(test_generator)
y_pred_classes = np.argmax(y_pred, axis=1)

# Classification report
# Passing labels explicitly keeps target_names aligned even when some class
# never appears in the evaluated samples or predictions; without it
# scikit-learn rejects the mismatched target_names length.
print("Classification report:")
print(classification_report(
    test_generator.classes,
    y_pred_classes,
    labels=class_labels,
    target_names=class_names,
))

# Confusion matrix
print("Confusion matrix:")
print(confusion_matrix(test_generator.classes, y_pred_classes, labels=class_labels))
|