In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import torch.nn.functional as F
import torch
from torch import nn
from torchsummary import summary
from importlib.util import find_spec
if find_spec("text_recognizer") is None:
    import sys
    sys.path.append('..')

In [2]:
from text_recognizer.networks import CNN, TDS2d

In [3]:
# Build the TDS2d network: four groups of three blocks each, with the
# per-group channel count and stride listed pairwise below.
tds2d = TDS2d(
    depth=4,
    tds_groups=[
        {"channels": channels, "num_blocks": 3, "stride": stride}
        for channels, stride in (
            (4, [2, 2]),
            (32, [2, 2]),
            (64, [2, 2]),
            (128, [2, 1]),
        )
    ],
    kernel_size=[5, 7],
    dropout_rate=0.1,
    input_dim=32,
    output_dim=128,
)

In [4]:
tds2d

TDS2d(
  (tds): Sequential(
    (0): Conv2d(1, 16, kernel_size=[5, 7], stride=[2, 2], padding=(2, 3))
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.1, inplace=False)
    (3): InstanceNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
    (4): TDSBlock2d(
      (conv): Sequential(
        (0): Conv3d(4, 4, kernel_size=(1, 5, 7), stride=(1, 1, 1), padding=(0, 2, 3))
        (1): ReLU(inplace=True)
        (2): Dropout(p=0.1, inplace=False)
      )
      (mlp): Sequential(
        (0): Linear(in_features=16, out_features=16, bias=True)
        (1): ReLU(inplace=True)
        (2): Dropout(p=0.1, inplace=False)
        (3): Linear(in_features=16, out_features=16, bias=True)
        (4): Dropout(p=0.1, inplace=False)
      )
      (instance_norm): ModuleList(
        (0): InstanceNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
        (1): InstanceNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
      )
    )


In [5]:
# summary() prints the table itself; the trailing ';' stops the notebook from
# echoing the returned object as a second copy of the same table.
summary(tds2d, (1, 28, 952), device="cpu", depth=3);

Layer (type:depth-idx)                        Output Shape              Param #
├─Sequential: 1-1                             [-1, 512, 2, 119]         --
|    └─Conv2d: 2-1                            [-1, 16, 14, 476]         576
|    └─ReLU: 2-2                              [-1, 16, 14, 476]         --
|    └─Dropout: 2-3                           [-1, 16, 14, 476]         --
|    └─InstanceNorm2d: 2-4                    [-1, 16, 14, 476]         32
|    └─TDSBlock2d: 2-5                        [-1, 16, 14, 476]         --
|    |    └─Sequential: 3-1                   [-1, 4, 4, 14, 476]       564
|    |    └─Sequential: 3-2                   [-1, 476, 14, 16]         544
|    └─TDSBlock2d: 2-6                        [-1, 16, 14, 476]         --
|    |    └─Sequential: 3-3                   [-1, 4, 4, 14, 476]       564
|    |    └─Sequential: 3-4                   [-1, 476, 14, 16]         544
|    └─TDSBlock2d: 2-7                        [-1, 16, 14, 476]         --
|    |    └─Seq

Layer (type:depth-idx)                        Output Shape              Param #
├─Sequential: 1-1                             [-1, 512, 2, 119]         --
|    └─Conv2d: 2-1                            [-1, 16, 14, 476]         576
|    └─ReLU: 2-2                              [-1, 16, 14, 476]         --
|    └─Dropout: 2-3                           [-1, 16, 14, 476]         --
|    └─InstanceNorm2d: 2-4                    [-1, 16, 14, 476]         32
|    └─TDSBlock2d: 2-5                        [-1, 16, 14, 476]         --
|    |    └─Sequential: 3-1                   [-1, 4, 4, 14, 476]       564
|    |    └─Sequential: 3-2                   [-1, 476, 14, 16]         544
|    └─TDSBlock2d: 2-6                        [-1, 16, 14, 476]         --
|    |    └─Sequential: 3-3                   [-1, 4, 4, 14, 476]       564
|    |    └─Sequential: 3-4                   [-1, 476, 14, 16]         544
|    └─TDSBlock2d: 2-7                        [-1, 16, 14, 476]         --
|    |    └─Seq

In [6]:
t = torch.randn(2,1, 28, 952)

In [7]:
tds2d(t).shape

torch.Size([2, 119, 128])

In [9]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not raise on machines without CUDA.
cnn = CNN().to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Minimal one-layer stack: a single 1x1 convolution, one channel in, one out.
i = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=1, kernel_size=1, stride=1)
)

In [None]:
nn.Sequential(i,i)

In [None]:
# `t` is created on the CPU, so move it to whichever device the model's
# weights live on before the forward pass (assumes CNN has at least one
# parameter — TODO confirm).
cnn(t.to(next(cnn.parameters()).device)).shape

In [None]:
from text_recognizer.networks.vqvae import Encoder, Decoder, VQVAE

In [None]:
vqvae = VQVAE(1, [32, 128, 128, 256], [4, 4, 4, 4], [2, 2, [1, 2], [1, 2]], 2, 32, 256, [[6, 119], [7, 238]])

In [None]:
t = torch.randn(2, 1, 28, 952)

In [None]:
x, l = vqvae(t)

In [None]:
5 * 59 / 10

In [None]:
x.shape

In [None]:
# The trailing ';' keeps the notebook from echoing summary()'s return value
# underneath the table that summary() already prints.
summary(vqvae, (1, 28, 952), device="cpu", depth=3);

In [None]:
up = nn.Upsample([4, 59])

In [None]:
# NOTE(review): `tt` is not assigned in any cell visible in this notebook, so
# this raises NameError on a fresh kernel — presumably left over from a deleted
# cell; TODO define the intended input tensor before this cell.
up(tt).shape

In [None]:
tt.shape

In [None]:
class GEGLU(nn.Module):
    """Linear projection gated by GELU.

    A single linear layer maps ``dim_in`` features to ``2 * dim_out``. The
    result is split in half along the last axis and the first half is
    multiplied by the GELU of the second half.
    """

    def __init__(self, dim_in, dim_out):
        """Create the projection from ``dim_in`` to ``2 * dim_out`` features."""
        super().__init__()
        # One matmul produces both the value half and the gate half.
        self.proj = nn.Linear(dim_in, 2 * dim_out)

    def forward(self, x):
        """Return ``value * gelu(gate)``; the last axis has ``dim_out`` features."""
        projected = self.proj(x)
        value, gate = projected.chunk(2, dim=-1)
        return value * F.gelu(gate)

In [None]:
e = GEGLU(256, 2048)

In [None]:
# GEGLU(256, 2048) applies a Linear layer over the last axis, so the input must
# end in 256 features; the `t` defined in the earlier cells is an image batch
# whose last axis is 952 and would raise a shape-mismatch error here.
e(torch.randn(2, 10, 256)).shape

In [None]:
emb = nn.Embedding(56, 256)

In [None]:
# Look up the embedding row for token id 55 without tracking gradients.
with torch.no_grad():
    token_id = torch.tensor([55], dtype=torch.long)
    e = emb(token_id)

In [None]:
from einops import repeat

In [None]:
ee = repeat(e, "() n -> b n", b=16)

In [None]:
# nn.Module objects have no `.device` attribute; the device is a property of
# the tensors they hold, so read it from the embedding's weight.
emb.weight.device

In [None]:
ee

In [None]:
ee.shape

In [None]:
t = torch.randn(16, 10, 256)

In [None]:
t.shape

In [None]:
t = torch.cat((ee.unsqueeze(1), t, ee.unsqueeze(1)), dim=1)

In [None]:
t.shape

In [None]:
e.shape

In [None]:
from text_recognizer.networks.residual_network import IdentityBlock, ResidualBlock, BasicBlock, BottleNeckBlock, ResidualLayer, ResidualNetwork, ResidualNetworkEncoder

In [None]:
from text_recognizer.networks import WideResidualNetwork

In [None]:
wr = WideResidualNetwork(
            in_channels= 1,
            num_classes= 80,
            in_planes=64,
            depth=10,
            num_layers=4,
            width_factor=2,
            num_stages=[64, 128, 256, 256],
            dropout_rate= 0.1,
            activation= "SELU",
            use_decoder= False,
)

In [None]:
from torchsummary import summary

In [None]:
backbone = ResidualNetworkEncoder(1, [64, 65, 66, 67, 68], [2, 2, 2, 2, 2])

In [None]:
# The trailing ';' keeps the notebook from echoing summary()'s return value
# underneath the table that summary() already prints.
summary(backbone, (1, 28, 952), device="cpu", depth=3);

In [None]:
        # Re-wrap every direct child module of `wr`, in order, as one Sequential.
        backbone = nn.Sequential(*wr.children())


In [None]:
backbone

In [None]:
# The trailing ';' keeps the notebook from echoing summary()'s return value
# underneath the table that summary() already prints.
summary(wr, (1, 28, 952), device="cpu", depth=3);

In [None]:
a = torch.rand(1, 1, 28, 952)

In [None]:
b = wr(a)

In [None]:
from einops import rearrange

In [None]:
b = rearrange(b, "b c h w -> b w c h")

In [None]:
c = nn.AdaptiveAvgPool2d((None, 1))

In [None]:
d = c(b)

In [None]:
d.shape

In [None]:
d.squeeze(3).shape

In [None]:
b.shape

In [None]:
from torch import nn

In [None]:
32 + 64

In [None]:
3 * 112

In [None]:
col_embed = nn.Parameter(torch.rand(1000, 256 // 2))

In [None]:
W, H = 196, 4

In [None]:
col_embed[:W].unsqueeze(0).repeat(H, 1, 1).shape

In [None]:
col_embed[:H].unsqueeze(1).repeat(1, W, 1).shape

In [None]:
           torch.cat(
                [
                    col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
                    col_embed[:H].unsqueeze(1).repeat(1, W, 1),
                ],
                dim=-1,
            ).unsqueeze(0).shape

In [None]:
4 * 196

In [None]:
target = torch.tensor([1,1,12,1,1,1,1,1,9,9,9,9,9,9])

In [None]:
torch.nonzero(target == 9, as_tuple=False)[0].item()

In [None]:
target[:9]

In [None]:
np.inf

In [None]:
from text_recognizer.networks.transformer.positional_encoding import PositionalEncoding

In [None]:
# Plot channels 4-7 of the positional encoding over 100 positions, obtained by
# feeding an all-zero input through the module.
plt.figure(figsize=(15, 5))
pe = PositionalEncoding(20, 0)
# Call the module itself rather than .forward() so that any registered hooks run.
y = pe(torch.zeros(1, 100, 20))
# .detach() is the supported way to obtain a graph-free tensor (replaces legacy .data).
plt.plot(np.arange(100), y[0, :, 4:8].detach().numpy())
# The trailing ';' suppresses the Legend repr that the cell would otherwise echo.
plt.legend([f"dim {p}" for p in range(4, 8)]);

In [None]:
from text_recognizer.networks.densenet import DenseNet,_DenseLayer,_DenseBlock

In [None]:
dnet = DenseNet(12, (6, 12, 10), 1, 24, 80, 4, 0, True)

In [None]:
216 / 8

In [None]:
# The trailing ';' keeps the notebook from echoing summary()'s return value
# underneath the table that summary() already prints.
summary(dnet, (1, 28, 952), device="cpu", depth=3);

In [None]:
        # Keep every direct child of `dnet` except the last four, in order.
        dnet_children = list(dnet.children())
        backbone = nn.Sequential(*dnet_children[:-4])

In [None]:
backbone

In [None]:
from text_recognizer.networks import WideResidualNetwork

In [None]:
w = WideResidualNetwork(
        in_channels = 1,
        in_planes = 32,
        num_classes = 80,
        depth = 10,
        width_factor = 1,
        dropout_rate = 0.0,
        num_layers = 5,
        activation = "relu",
        use_decoder = False,)

In [None]:
# The trailing ';' keeps the notebook from echoing summary()'s return value
# underneath the table that summary() already prints.
summary(w, (1, 28, 952), device="cpu", depth=2);

In [None]:
sz= 5

In [None]:
# Square float mask: -inf strictly above the diagonal, 0 on and below it.
above_diagonal = torch.ones(sz, sz, dtype=torch.bool).triu(1)
mask = torch.zeros(sz, sz).masked_fill(above_diagonal, float('-inf'))

In [None]:

h = torch.rand(1, 256, 10, 10)

In [None]:
h.flatten(2).permute(2, 0, 1).shape

In [None]:
h.flatten(2).permute(2, 0, 1).shape

In [None]:
mask


In [None]:
# Two flat sequences of 30 integer token ids each, stored as int64 tensors.
pred = torch.tensor(
    [
        1, 21, 2, 45, 31, 81, 1, 79, 79, 79,
        2, 1, 1, 1, 1, 81, 1, 79, 79, 79,
        1, 1, 1, 1, 1, 81, 79, 79, 79, 79,
    ],
    dtype=torch.long,
)
target = torch.tensor(
    [
        1, 1, 1, 1, 1, 81, 79, 79, 79, 79,
        1, 1, 1, 1, 1, 81, 79, 79, 79, 79,
        1, 1, 1, 1, 1, 81, 79, 79, 79, 79,
    ],
    dtype=torch.long,
)

In [None]:
mask = (target != 79)

In [None]:
mask

In [None]:
pred * mask

In [None]:
target * mask

In [None]:
from text_recognizer.models.metrics import accuracy

In [None]:
pad_indcies = torch.nonzero(target == 79, as_tuple=False)

In [None]:
t1 = torch.nonzero(target == 81, as_tuple=False).squeeze(1)

In [None]:
target.shape[0]

In [None]:
t2 = torch.arange(10, target.shape[0] + 1, 10)

In [None]:
t2

In [None]:
for start, stop in zip(t1, t2):
    pred[start+1:stop] = 79

In [None]:
pred

In [None]:
# Assignment is a statement and cannot appear inside a list comprehension
# (SyntaxError), so the in-place fill is written as a plain loop: for each
# (start, stop) pair, overwrite pred[start+1:stop] with 79.
for start, stop in zip(t1, t2):
    pred[start + 1:stop] = 79

In [None]:
pad_indcies

In [None]:
# `pad_indcies` comes from torch.nonzero(..., as_tuple=False) on a 1-D tensor,
# so it has shape (N, 1) and cannot serve as slice bounds. Flatten it to (N,)
# and use it as an integer index to overwrite exactly those positions.
pred[pad_indcies.squeeze(1)] = 79

In [None]:
pred.shape

In [None]:
target.shape

In [None]:
accuracy(pred, target)

In [None]:
acc = (pred == target).sum().float() / target.shape[0]

In [None]:
acc