nilekhet committed on
Commit
b743670
·
1 Parent(s): 081f8b3

Upload 6 files

Browse files
Files changed (7) hide show
  1. .gitattributes +1 -0
  2. labelData.csv +10 -0
  3. malware_classifier_lime.h5 +3 -0
  4. model_training.py +131 -0
  5. predict.py +61 -0
  6. spectrum.tar +3 -0
  7. validate.py +50 -0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ spectrum.tar filter=lfs diff=lfs merge=lfs -text
labelData.csv ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Login successful for user admin,0
2
+ User admin added a new user john_doe,0
3
+ Failed login attempt from IP 192.168.1.55,1
4
+ User john_doe deleted file important_document.txt,0
5
+ New user jane_doe registered,0
6
+ Unauthorized access from IP 10.0.0.27,1
7
+ User admin updated file financial_report.xlsx,0
8
+ Failed login attempt from IP 192.168.1.56,1
9
+ User jane_doe uploaded file meeting_notes.docx,0
10
+ Unauthorized access from IP 10.0.0.28,1
malware_classifier_lime.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b23d9ceac1ec5ac69287c9a597d51586a017dcc525aede8a6b0953ba0597ca56
3
+ size 53156780
model_training.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #the goal of this script is to train the model and then save it
2
+ import os
3
+ import tensorflow as tf
4
+ from tensorflow.keras.preprocessing.image import ImageDataGenerator
5
+ from tensorflow.keras.models import Sequential
6
+ from tensorflow.keras.layers import Conv2D, Activation, MaxPooling2D, Flatten, Dense, Dropout
7
+ from tensorflow.keras.optimizers import Adam
8
+ from tensorflow.keras.models import load_model
9
+ from tensorflow.keras.preprocessing.image import load_img, img_to_array
10
+ import shutil
11
+ from PIL import Image
12
+ from tensorflow.keras.preprocessing.image import load_img, img_to_array
13
+ import matplotlib.pyplot as plt
14
+ import cv2
15
+ #import seaborn as sns
16
+ import numpy as np
17
+ import pickle
18
+
19
def clean_directory(directory, cache_file="cache.pkl"):
    """Remove unreadable images and empty class folders, then count classes.

    Walks ``directory`` and treats every folder that has no sub-folders as
    one class. Each file in such a folder is opened with Pillow and verified;
    files that fail verification are deleted. A folder left without any valid
    file is removed and not counted. The final count is pickled to
    ``cache_file``.

    If ``cache_file`` already exists, its content is returned immediately and
    ``directory`` is not scanned at all. Delete the cache file to force a
    rescan, for example after the dataset has changed.

    Args:
        directory: Root folder of the image dataset.
        cache_file: Path of the pickle file that stores the class count.

    Returns:
        The number of leaf folders containing at least one valid image, or
        the cached value when ``cache_file`` exists.
    """
    if os.path.exists(cache_file):
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # cache_file at a file that was written by this function.
        with open(cache_file, "rb") as f:
            num_classes = pickle.load(f)
        print("Loaded cached results.")
        return num_classes

    num_classes = 0
    for subdir, dirs, files in os.walk(directory):
        # Only leaf folders are class folders; intermediate ones are skipped.
        if dirs:
            continue

        has_valid_file = False
        for file in files:
            file_path = os.path.join(subdir, file)
            try:
                # The context manager closes the file handle even when
                # verify() raises, so handles are not leaked across a large
                # dataset and the file can be deleted right afterwards.
                with Image.open(file_path) as img:
                    img.verify()  # Raises if the image is corrupted
            except (OSError, SyntaxError):
                print(f"Removing corrupted file: {file_path}")
                os.remove(file_path)
            else:
                has_valid_file = True

        if has_valid_file:
            num_classes += 1
        else:
            # Nothing usable is left in this class folder.
            print(f"Removing empty directory: {subdir}")
            shutil.rmtree(subdir)

    # Save the result so later runs can skip the scan.
    with open(cache_file, "wb") as f:
        pickle.dump(num_classes, f)
    print("Saved results to cache.")

    return num_classes
53
+
54
# Training script: clean the dataset, build the generators, train a small CNN
# and save it to disk.
data_dir = 'Malign/extract'

# Removes corrupted images / empty class folders (or reads the cached count).
num_classes = clean_directory(data_dir)

# Parameters
batch_size = 32
epochs = 50
image_size = (200, 200)  # Set the desired image size for input to the model

# Data preprocessing
train_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2  # Split 20% of data for validation
)

train_generator = train_datagen.flow_from_directory(
    data_dir,
    target_size=image_size,
    batch_size=batch_size,
    class_mode='categorical',
    subset='training'
)

validation_generator = train_datagen.flow_from_directory(
    data_dir,
    target_size=image_size,
    batch_size=batch_size,
    class_mode='categorical',
    subset='validation'
)

# The one-hot labels produced by the generator have one column per entry in
# class_indices, so the output layer must have exactly that many units. A
# hard-coded width breaks training as soon as the dataset has a different
# number of class folders.
output_units = len(train_generator.class_indices)
if output_units != num_classes:
    print(
        f"Warning: clean_directory reported {num_classes} classes but the "
        f"generator found {output_units}; using {output_units}."
    )

# Model creation
model = Sequential()

# First convolution layer
model.add(Conv2D(64, (3, 3), input_shape=(*image_size, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Second convolution layer
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Third convolution layer
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Fully connected layers
model.add(Flatten())
model.add(Dense(128))
model.add(Dropout(0.5))
model.add(Activation('relu'))

# Output layer: one softmax unit per class
model.add(Dense(output_units))
model.add(Activation('softmax'))

model.summary()

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Model training
history = model.fit(
    train_generator,
    epochs=epochs,
    validation_data=validation_generator
)

# Save the trained model
model.save("malware_classifier_lime.h5")
129
+
130
+
131
+
predict.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from tensorflow.keras.models import load_model
5
+ from tensorflow.keras.preprocessing.image import load_img, img_to_array
6
+ import matplotlib.pyplot as plt
7
+ from lime import lime_image
8
+ from skimage.segmentation import mark_boundaries
9
+
10
def explain_instance(image_path, model, num_features=5, num_samples=1000):
    """Build a LIME explanation for the image stored at ``image_path``.

    Relies on the module-level names ``image_size``, ``explainer`` and
    ``num_classes``, which the ``__main__`` section assigns before calling
    this function.

    Args:
        image_path: Path of the image file to explain.
        model: Object whose ``predict`` method is handed to LIME as the
            classifier function.
        num_features: Forwarded to the explainer as ``num_features``.
        num_samples: Forwarded to the explainer as ``num_samples``.

    Returns:
        The object returned by ``explainer.explain_instance``.
    """
    # Same preprocessing as for prediction: resize, then scale by 1/255.
    pixels = img_to_array(load_img(image_path, target_size=image_size)) / 255
    return explainer.explain_instance(
        pixels,
        model.predict,
        top_labels=num_classes,
        hide_color=0,
        num_samples=num_samples,
        num_features=num_features,
    )
16
+
17
if __name__ == "__main__":
    # Local import: this script's import header does not bring in ``os``.
    import os

    if len(sys.argv) != 2:
        print("Usage: predict.py image_path")
        sys.exit(1)

    image_path = sys.argv[1]
    image_size = (200, 200)
    model_path = "malware_classifier_lime.h5"
    data_dir = "Malign/extract"  # Dataset root used by the training script
    model = load_model(model_path)
    # Take the class count from the model's output layer instead of
    # hard-coding it, so the script keeps working after retraining.
    num_classes = model.output_shape[-1]

    # explain_instance() reads image_size, explainer and num_classes from
    # module scope, so they must be assigned before this call.
    explainer = lime_image.LimeImageExplainer()
    explanation = explain_instance(image_path, model)

    temp, mask = explanation.get_image_and_mask(
        explanation.top_labels[0],
        positive_only=True,
        num_features=5,
        hide_rest=False,
    )

    # Load and scale the image once; it is reused for display and prediction.
    img = load_img(image_path, target_size=image_size)
    img_array = img_to_array(img) / 255

    # Display the original image
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.imshow(img_array)
    plt.title("Original Image")
    plt.axis("off")

    # Display the LIME explanation
    plt.subplot(1, 2, 2)
    plt.imshow(mark_boundaries(temp, mask))
    plt.title("LIME Explanation")
    plt.axis("off")

    plt.show()

    # Make a prediction on a batch of one image
    prediction = model.predict(np.expand_dims(img_array, axis=0))
    predicted_class = int(np.argmax(prediction))

    # Get the class name. The original looked it up in ``train_generator``,
    # which does not exist in this script and raised NameError. Rebuild the
    # mapping the same way flow_from_directory does: sub-folder names of the
    # dataset root in sorted order, index = position.
    class_names = []
    if os.path.isdir(data_dir):
        class_names = [
            entry
            for entry in sorted(os.listdir(data_dir))
            if os.path.isdir(os.path.join(data_dir, entry))
        ]
    if predicted_class < len(class_names):
        class_name = class_names[predicted_class]
    else:
        # Dataset folder missing or out of sync with the saved model.
        class_name = "unknown"

    print(f"Predicted class: {predicted_class}, Class name: {class_name}")
spectrum.tar ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:485d96d9d396c57cde7433181d3645ebe5f87155972921a50ee101f9882a515d
3
+ size 542136320
validate.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from tensorflow.keras.preprocessing.image import ImageDataGenerator
5
+ from tensorflow.keras.models import load_model
6
+ from sklearn.metrics import classification_report, confusion_matrix
7
+ import pickle
8
+
9
# Validation script: evaluate the saved model on the dataset folder and print
# a classification report and confusion matrix.

# Load the saved model
model = load_model("malware_classifier_lime.h5")

data_dir = 'Malign/extract'

# Load the number of classes from the cache file written by the training
# script. NOTE(security): pickle.load can execute arbitrary code; cache.pkl
# must be a file produced by the training script.
with open("cache.pkl", "rb") as f:
    num_classes = pickle.load(f)

# Parameters
batch_size = 32
image_size = (200, 200)

# Data preprocessing
test_datagen = ImageDataGenerator(rescale=1./255)

# NOTE(review): this iterates over the whole of data_dir, which is the same
# folder the training script reads, so the scores include images the model
# was trained on. Confirm that this is intended.
test_generator = test_datagen.flow_from_directory(
    data_dir,
    target_size=image_size,
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=False  # Keep file order so predictions line up with .classes
)

# Class names ordered by their integer index, plus the matching label ids.
class_names = sorted(test_generator.class_indices, key=test_generator.class_indices.get)
label_ids = list(range(len(class_names)))

# The cached count was previously loaded but never used; use it to detect a
# dataset folder that no longer matches what the model was trained on.
if num_classes != len(class_names):
    print(
        f"Warning: cache.pkl reports {num_classes} classes but "
        f"{len(class_names)} class folders were found in {data_dir}."
    )

# Evaluate the model
print("Evaluating the model...")
score = model.evaluate(test_generator)
print("Loss: ", score[0])
print("Accuracy: ", score[1])

# Predict the class labels
print("Predicting the class labels...")
y_pred = model.predict(test_generator)
y_pred_classes = np.argmax(y_pred, axis=1)

# Classification report. Passing ``labels`` pins the report to every known
# class, so it does not fail when a class is absent from both the truth and
# the predictions; ``target_names`` is given as a real list, not a dict view.
print("Classification report:")
print(
    classification_report(
        test_generator.classes,
        y_pred_classes,
        labels=label_ids,
        target_names=class_names,
    )
)

# Confusion matrix (rows/columns follow label_ids)
print("Confusion matrix:")
print(confusion_matrix(test_generator.classes, y_pred_classes, labels=label_ids))