Multifaceted-Digit-Recognizer/train_tesnorflow_sound.py at master · skshashankkumar41/Multifaceted-Digit-Recognizer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from imutils import paths
import os
import numpy as np
import librosa
from tqdm import tqdm
from keras.utils import to_categorical
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten,BatchNormalization
from keras.layers import Conv2D, MaxPooling2D

def plt_dynamic(x, vy, ty, ax, colors=('b',)):
    """Plot validation and training loss curves on ``ax`` and redraw its figure.

    Args:
        x: X-axis values shared by both curves.
        vy: Validation loss values, one per entry of ``x``.
        ty: Training loss values, one per entry of ``x``.
        ax: Axes object to draw on.
        colors: Unused; kept only so callers that pass it keep working.
            The curves are always blue (validation) and red (train).
    """
    ax.plot(x, vy, 'b', label="Validation Loss")
    ax.plot(x, ty, 'r', label="Train Loss")
    # Work on the axes that was passed in instead of pyplot's implicit
    # "current" axes and a module-level ``fig``; this keeps the function
    # usable for any axes and independent of global state.
    ax.legend()
    ax.grid()
    ax.figure.canvas.draw()


# Every file found under the 'recordings' directory is treated as one sample.
files = list(paths.list_files('recordings'))

# Turn each clip into a fixed-size 40x40 MFCC matrix
# (40 coefficients x 40 frames) so all samples share one input shape.
data = []
for audio_path in tqdm(files):
    signal, sample_rate = librosa.load(audio_path, sr=8000, mono=True)
    # Pass the signal by keyword: librosa 0.10+ made these arguments
    # keyword-only, so the positional form raises a TypeError there.
    mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
    if mfcc.shape[1] > 40:
        # Longer clips: keep only the first 40 frames.
        mfcc = mfcc[:, :40]
    else:
        # Shorter clips: zero-pad on the right of the time axis up to 40 frames.
        mfcc = np.pad(mfcc, ((0, 0), (0, 40 - mfcc.shape[1])),
                      mode='constant', constant_values=0)
    data.append(mfcc)

# Stack the samples and add a trailing channel axis for the Conv2D input.
data = np.array(data)
data = data.reshape((data.shape[0], 40, 40, 1))

# The label of each sample is the second component of its path.
# NOTE(review): presumably the layout is recordings/<digit>/<file> -- confirm.
labels = np.array([i.split(os.path.sep)[1] for i in files])

# Convert the string labels to integers explicitly, and pin the one-hot width
# to the 10 outputs of the network so a missing highest digit cannot produce a
# narrower target matrix.
labels = to_categorical(labels.astype(int), num_classes=10)

# Hold out 20% of the samples; the split is random on every run.
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)

# CNN over the 40x40x1 MFCC input: three 2x2 convolutions with two
# pool + batch-norm stages, then two dense layers and a 10-way softmax.
model = Sequential([
    Conv2D(32, kernel_size=(2, 2), activation='relu',
           input_shape=(40, 40, 1), padding='same'),
    Conv2D(48, (2, 2), activation='relu', padding='same'),
    MaxPooling2D(pool_size=(2, 2)),
    BatchNormalization(),
    Conv2D(128, (2, 2), activation='relu', padding='same'),
    MaxPooling2D(pool_size=(2, 2)),
    BatchNormalization(),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.25),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.40),
    Dense(10, activation='softmax'),
])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer='adam',
              metrics=['accuracy'])

# Train for 50 epochs; the held-out split is also used as validation data,
# so per-epoch validation metrics are recorded in ``history``.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=64,
    verbose=2,
)

# Final loss and accuracy on the held-out split.
score = model.evaluate(X_test, y_test, verbose=2)
test_loss = score[0]
test_accuracy = score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

# Plot the training and validation loss per epoch.
fig, ax = plt.subplots(1, 1)
ax.set_xlabel('epoch')
ax.set_ylabel('Categorical Crossentropy Loss')

vy = history.history['val_loss']
ty = history.history['loss']

# Derive the epoch axis from the recorded history instead of repeating the
# epoch count as a literal, so the plot stays valid if training length changes.
x = list(range(1, len(ty) + 1))

plt_dynamic(x, vy, ty, ax)

# Per-class precision/recall/F1 on the held-out split: collapse the one-hot
# targets and the predicted probabilities to class indices first.
predictions = model.predict(X_test, batch_size=32)
true_classes = y_test.argmax(axis=1)
predicted_classes = predictions.argmax(axis=1)
print(classification_report(true_classes, predicted_classes))

# Save the trained model to disk.
model.save('mnist_sound.h5')