# Transformer Layers

Given an input X of shape (T, d), write functions using NumPy that feed it through an encoder layer and a decoder layer of the Transformer. You will also have to implement the internal workings of the scaled dot-product attention and multi-head attention layers. For convenience, take the projection dimension to be the same (dim_size = d) for the Query, Key, and Value. Only forward propagation is expected to be implemented; it is executed by the `forward` method of each class.
<center>
<img src="./fig/Transformer.png" width="324" height="470">
</center>
<br>
<center>
<img src="./fig/Attention-layers.png" width="550" height="350">
</center>


In [1]:
import numpy as np

In [2]:
class ScaledDotProductAttention:
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

    All inputs are 4-D arrays laid out as (batch, heads, seq, head_size).
    """

    def __init__(self, dim_size):
        # Length of each query/key vector; logits are divided by sqrt(d_k).
        self.d_k = dim_size

    def forward(self, Q, K, V, Mask=None):
        """Return the attention-weighted combination of the value vectors.

        Args:
            Q: queries, shape (batch, heads, seq_q, head_size).
            K: keys, shape (batch, heads, seq_k, head_size).
            V: values, shape (batch, heads, seq_k, value_size).
            Mask: optional array broadcastable to
                (batch, heads, seq_q, seq_k). Positions equal to 1 are
                blocked, i.e. they receive (numerically) zero weight.

        Returns:
            Array of shape (batch, heads, seq_q, value_size).
        """
        # (batch, heads, seq_q, seq_k)
        attn_logits = np.einsum("bhqd,bhkd->bhqk", Q, K) / np.sqrt(self.d_k)

        if Mask is not None:
            blocked = np.broadcast_to(np.asarray(Mask) == 1, attn_logits.shape)
            # A huge negative logit drives the softmax weight to zero.
            attn_logits = np.where(blocked, -9e15, attn_logits)

        # Softmax over the key axis. Subtracting the row maximum leaves the
        # result unchanged mathematically but keeps exp() from overflowing.
        attn_logits = attn_logits - np.max(attn_logits, axis=-1, keepdims=True)
        weights = np.exp(attn_logits)
        attention = weights / np.sum(weights, axis=-1, keepdims=True)

        # Contract over the key axis:
        # (batch, heads, seq_q, seq_k) x (batch, heads, seq_k, value_size)
        return np.einsum("bhqk,bhkd->bhqd", attention, V)

    def __call__(self, Q, K, V, Mask=None):
        """Allow the layer to be invoked like a function; same as forward."""
        return self.forward(Q, K, V, Mask)

In [3]:
class MultiHeadAttention:
    """Multi-head attention with square (dim_size x dim_size) projections.

    The projected queries, keys and values are split into ``num_heads``
    heads of ``dim_size // num_heads`` features each, attended per head by
    ``ScaledDotProductAttention``, concatenated again and passed through a
    final output projection.
    """

    def __init__(self, dim_size, num_heads):
        """Create the attention layer and randomly initialise its weights.

        Args:
            dim_size: feature size of the inputs and of the output.
            num_heads: number of attention heads; must divide ``dim_size``.

        Raises:
            ValueError: if ``dim_size`` is not a positive multiple of
                ``num_heads``.
        """
        if num_heads <= 0 or dim_size % num_heads != 0:
            raise ValueError(
                "dim_size must be divisible by a positive num_heads"
            )
        self.heads = num_heads
        self.dim_size = dim_size
        self.head_size = dim_size // num_heads
        self.scaled_dot_product = ScaledDotProductAttention(self.head_size)

        # Uniform initialisation in [-1/sqrt(dim_size), 1/sqrt(dim_size)].
        scale = 1 / np.sqrt(self.dim_size)
        weight_shape = (self.dim_size, self.dim_size)
        self.WQ = np.random.uniform(-scale, scale, weight_shape)
        self.WK = np.random.uniform(-scale, scale, weight_shape)
        self.WV = np.random.uniform(-scale, scale, weight_shape)
        self.WO = np.random.uniform(-scale, scale, weight_shape)

    def _split_heads(self, x):
        """Reshape (batch, seq, dim_size) to (batch, heads, seq, head_size)."""
        x = x.reshape(x.shape[0], x.shape[1], self.heads, self.head_size)
        return np.transpose(x, (0, 2, 1, 3))

    def forward(self, Q, K, V, Mask=None):
        """Run multi-head attention.

        Args:
            Q: queries, shape (batch, seq_q, dim_size) or (seq_q, dim_size).
            K: keys, shape (batch, seq_k, dim_size) or (seq_k, dim_size).
            V: values, same leading shape as ``K``.
            Mask: optional array broadcastable to
                (batch, heads, seq_q, seq_k); it is expanded to that shape
                and handed to the scaled dot-product attention.

        Returns:
            Array shaped like ``Q``: (batch, seq_q, dim_size), or
            (seq_q, dim_size) when ``Q`` was given without a batch axis.
        """
        Q, K, V = np.asarray(Q), np.asarray(K), np.asarray(V)

        # Accept a single (seq, dim_size) sequence by adding a batch axis
        # that is removed again before returning.
        unbatched = Q.ndim == 2
        Q, K, V = (x[None] if x.ndim == 2 else x for x in (Q, K, V))

        Q = self._split_heads(np.dot(Q, self.WQ))   # batch x heads x seq_q x head_size
        K = self._split_heads(np.dot(K, self.WK))   # batch x heads x seq_k x head_size
        V = self._split_heads(np.dot(V, self.WV))   # batch x heads x seq_k x head_size

        if Mask is not None:
            # Expand to the full logits shape so that e.g. a (seq, seq)
            # mask is applied to every batch element and every head.
            logits_shape = (Q.shape[0], self.heads, Q.shape[2], K.shape[2])
            Mask = np.broadcast_to(np.asarray(Mask), logits_shape)

        values = self.scaled_dot_product.forward(Q, K, V, Mask)
        values = np.transpose(values, (0, 2, 1, 3))  # batch x seq_q x heads x head_size
        # Concatenate the heads back into one feature axis.
        values = values.reshape(values.shape[0], values.shape[1], self.dim_size)
        out = np.dot(values, self.WO)                # batch x seq_q x dim_size
        return out[0] if unbatched else out

    def __call__(self, Q, K, V, Mask=None):
        """Allow the layer to be invoked like a function; same as forward."""
        return self.forward(Q, K, V, Mask)

In [4]:
class Linear:
    """Bias-free linear projection ``x @ W``."""

    def __init__(self, in_features, out_features):
        """Initialise ``W`` uniformly in [-1/sqrt(in_features), 1/sqrt(in_features)].

        Args:
            in_features: size of the last axis of the input.
            out_features: size of the last axis of the output.
        """
        scale = 1 / np.sqrt(in_features)
        self.W = np.random.uniform(-scale, scale, (in_features, out_features))

    def forward(self, x):
        """Return ``np.dot(x, W)``; the last axis of ``x`` is projected."""
        return np.dot(x, self.W)

    def __call__(self, x):
        """Allow the layer to be invoked like a function; same as forward."""
        return self.forward(x)
    
class Relu:
    """Element-wise rectified linear unit, ``max(0, x)``."""

    def __init__(self):
        # Stateless layer: nothing to initialise.
        pass

    def forward(self, x):
        """Return ``x`` with every negative entry replaced by zero."""
        return np.maximum(0, x)

    def __call__(self, x):
        """Allow the layer to be invoked like a function; same as forward."""
        return self.forward(x)

In [11]:
class Encoder:
    """Transformer encoder layer: self-attention followed by a feed-forward net.

    Each sub-layer is wrapped in a residual addition. No layer
    normalisation is applied.
    """

    def __init__(self, dim_size, num_heads):
        """Build the sub-layers.

        Args:
            dim_size: feature size of the input and output.
            num_heads: number of heads of the self-attention layer.
        """
        super().__init__()
        self.self_attn = MultiHeadAttention(dim_size, num_heads)
        # Position-wise feed-forward network: dim_size -> 2*dim_size -> dim_size.
        self.linear1 = Linear(in_features=dim_size, out_features=dim_size * 2)
        self.activation = Relu()
        self.linear2 = Linear(in_features=dim_size * 2, out_features=dim_size)

    def forward(self, X):
        """Apply the encoder layer.

        Args:
            X: input array whose last axis has size ``dim_size``, in the
                layout expected by ``MultiHeadAttention.forward``.

        Returns:
            Array with the same shape as ``X``.
        """
        # Self-attention (queries, keys and values are all X) + residual.
        attn_out = self.self_attn.forward(X, X, X, None)
        X = X + attn_out

        # Feed-forward: expand with linear1, ReLU, project back with linear2.
        hidden = self.linear1.forward(X)
        hidden = self.activation.forward(hidden)
        hidden = self.linear2.forward(hidden)

        # Residual around the feed-forward network.
        return X + hidden

    def __call__(self, X):
        """Allow the layer to be invoked like a function; same as forward."""
        return self.forward(X)

In [6]:
class Decoder:
    """Transformer decoder layer.

    Masked self-attention, then encoder-decoder (cross) attention, then a
    feed-forward network. Each sub-layer is wrapped in a residual addition.
    No layer normalisation is applied.
    """

    def __init__(self, dim_size, num_heads1, num_heads2):
        """Build the sub-layers.

        Args:
            dim_size: feature size of the input and output.
            num_heads1: number of heads of the masked self-attention layer.
            num_heads2: number of heads of the cross-attention layer.
        """
        self.self_masked_attn = MultiHeadAttention(dim_size, num_heads1)

        self.self_attn = MultiHeadAttention(dim_size, num_heads2)
        # Position-wise feed-forward network: dim_size -> 2*dim_size -> dim_size.
        self.linear1 = Linear(in_features=dim_size, out_features=dim_size * 2)
        self.activation = Relu()
        self.linear2 = Linear(in_features=dim_size * 2, out_features=dim_size)

    def forward(self, X, encoder_input, mask=None):
        """Apply the decoder layer.

        Args:
            X: decoder input whose last axis has size ``dim_size``, in the
                layout expected by ``MultiHeadAttention.forward``.
            encoder_input: encoder output attended to by the cross-attention
                layer; its last axis has size ``dim_size``.
            mask: mask forwarded to the masked self-attention layer, or
                ``None`` for no masking.

        Returns:
            Array with the same shape as ``X``.
        """
        # Masked self-attention over the decoder sequence + residual.
        mask_attn_out = self.self_masked_attn.forward(X, X, X, mask)
        X = X + mask_attn_out

        # Cross-attention: queries come from the decoder, keys and values
        # from the encoder, so the result has one row per decoder position
        # and can be added back onto X.
        attn_out = self.self_attn.forward(X, encoder_input, encoder_input, None)
        X = X + attn_out

        # Feed-forward: expand with linear1, ReLU, project back with linear2.
        hidden = self.linear1.forward(X)
        hidden = self.activation.forward(hidden)
        hidden = self.linear2.forward(hidden)

        # Residual around the feed-forward network.
        return X + hidden

    def __call__(self, X, encoder_input, mask=None):
        """Allow the layer to be invoked like a function; same as forward."""
        return self.forward(X, encoder_input, mask)