|
|
import torch
|
|
|
import torch.nn as nn
|
|
|
from torchvision.models import resnet50, ResNet50_Weights
|
|
|
import torch.nn.functional as F
|
|
|
from huggingface_hub import PyTorchModelHubMixin
|
|
|
|
|
|
|
|
|
|
|
|
class Identity(nn.Module):
    """Pass-through module whose forward returns its input unchanged.

    It owns no parameters or buffers, so it can stand in for any layer
    that should be skipped without altering the data flowing through.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` itself (the same object, not a copy)."""
        return x
|
|
|
|
|
|
class AdditiveAttention(nn.Module):
    """Self-gating block built from additive (tanh) attention scores.

    Query, key and value are all linear projections of the *same* input
    tensor. The tanh of ``query + key`` is passed through one more linear
    layer, normalised with a softmax over dimension 1, and used to scale
    the value projection element-wise.

    Args:
        d_model: Size of the last dimension of the input tensor.
        hidden_dim: Size of the projected (and returned) feature dimension.
    """

    def __init__(self, d_model: int, hidden_dim: int = 128):
        super().__init__()
        # Attribute names and creation order are kept stable so that
        # state_dict keys and seeded initialisation do not change.
        self.query_projection = nn.Linear(d_model, hidden_dim)
        self.key_projection = nn.Linear(d_model, hidden_dim)
        self.value_projection = nn.Linear(d_model, hidden_dim)
        self.attention_mechanism = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, query: torch.Tensor) -> torch.Tensor:
        """Gate the value projection of ``query`` with softmax weights.

        Args:
            query: Input tensor whose last dimension is ``d_model``.
                Presumably ``(batch, d_model)`` -- TODO confirm; the softmax
                is always taken over dimension 1.

        Returns:
            Tensor with the same leading dimensions as ``query`` and a last
            dimension of ``hidden_dim``.
        """
        q = self.query_projection(query)
        k = self.key_projection(query)
        v = self.value_projection(query)

        # Additive score: tanh(q + k), then a learned linear map to logits.
        gate_logits = self.attention_mechanism(torch.tanh(q + k))

        # Element-wise product, not a weighted sum: the output keeps the
        # shape of the value projection.
        return v * F.softmax(gate_logits, dim=1)
|
|
|
|
|
|
class ResNet50Custom(nn.Module, PyTorchModelHubMixin):
    """torchvision ResNet-50 with a custom input stem and output head.

    The network is created with the ``IMAGENET1K_V1`` weights; its first
    convolution and its final fully connected layer are then replaced by
    newly constructed layers, so those two layers do not carry the
    pretrained weights.

    Args:
        input_channels: Number of channels expected by the first convolution.
        num_classes: Output size of the replacement ``fc`` layer.
        **kwargs: Extra entries that are only recorded in ``self.config``.
    """

    def __init__(self, input_channels: int, num_classes: int, **kwargs):
        super().__init__()

        # Constructor arguments, kept on the instance as a plain dict.
        config = {"input_channels": input_channels, "num_classes": num_classes}
        config.update(kwargs)
        self.config = config

        self.input_channels = input_channels

        backbone = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
        # Same geometry as the stock ResNet stem, but with a configurable
        # number of input channels.
        backbone.conv1 = nn.Conv2d(
            input_channels,
            64,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=False,
        )
        # New classification head sized for this task.
        backbone.fc = nn.Linear(backbone.fc.in_features, num_classes)
        self.model = backbone

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the wrapped ResNet-50 and return its output."""
        return self.model(x)

    def get_feature_size(self) -> int:
        """Return ``in_features`` of the current ``fc`` head.

        This reads the live ``self.model.fc`` attribute, so it only works
        while that attribute is a layer exposing ``in_features``.
        """
        return self.model.fc.in_features
|
|
|
|
|
|
|
|
|
class MultiModalModel(nn.Module, PyTorchModelHubMixin):
    """Late-fusion classifier over three image-like inputs.

    Each input stream (called image, bathy and sss here -- presumably optical
    imagery, bathymetry and side-scan sonar; confirm against the data
    pipeline) has its own ``ResNet50Custom`` backbone and its own
    ``AdditiveAttention`` gate. The three gated feature vectors are
    concatenated along dimension 1 and passed through three stacked linear
    layers to produce the class scores.

    Args:
        image_input_channels: Input channels of the image backbone.
        bathy_input_channels: Input channels of the bathy backbone.
        sss_input_channels: Input channels of the sss backbone.
        num_classes: Number of output classes; must be convertible with
            ``int()``.
        attention_type: Recorded in ``self.config`` and
            ``self.attention_type`` only. Nothing in this class reads it;
            ``AdditiveAttention`` is always used.
        **kwargs: Extra entries that are only recorded in ``self.config``.

    Raises:
        TypeError, ValueError: If ``num_classes`` cannot be converted to int.
    """

    def __init__(self,
                 image_input_channels: int,
                 bathy_input_channels: int,
                 sss_input_channels: int,
                 num_classes: int,
                 attention_type: str = "scaled_dot_product",
                 **kwargs):
        super().__init__()

        # Normalise once, up front: int() either returns an int or raises,
        # so no further isinstance check is needed, and bad input fails
        # before three ResNet-50 backbones are built.
        num_classes_int = int(num_classes)

        self.config = {
            "image_input_channels": image_input_channels,
            "bathy_input_channels": bathy_input_channels,
            "sss_input_channels": sss_input_channels,
            "num_classes": num_classes,
            "attention_type": attention_type,
            **kwargs
        }

        # One backbone per modality. Their own fc heads are bypassed in
        # forward(); they are kept so the parameter layout stays unchanged.
        self.image_model_feat = ResNet50Custom(
            input_channels=image_input_channels, num_classes=num_classes_int)
        self.bathy_model_feat = ResNet50Custom(
            input_channels=bathy_input_channels, num_classes=num_classes_int)
        self.sss_model_feat = ResNet50Custom(
            input_channels=sss_input_channels, num_classes=num_classes_int)

        # All three backbones share the same architecture, so the image
        # backbone's feature size is used for every attention block.
        feature_dim = self.image_model_feat.get_feature_size()

        attention_hidden_dim = 128
        self.attention_image = AdditiveAttention(feature_dim, hidden_dim=attention_hidden_dim)
        self.attention_bathy = AdditiveAttention(feature_dim, hidden_dim=attention_hidden_dim)
        self.attention_sss = AdditiveAttention(feature_dim, hidden_dim=attention_hidden_dim)

        # NOTE(review): fc -> fc1 -> fc2 are applied back to back without any
        # activation in forward(), so together they form one affine map. The
        # layer sizes (including 1284) are left as they are so the parameter
        # shapes do not change.
        self.fc = nn.Linear(3 * attention_hidden_dim, 1284)
        self.fc1 = nn.Linear(1284, 32)
        self.fc2 = nn.Linear(32, num_classes_int)

        self.attention_type = attention_type

    def forward(self, inputs: torch.Tensor, bathy_tensor: torch.Tensor, sss_image: torch.Tensor) -> torch.Tensor:
        """Classify one batch of the three modalities.

        Args:
            inputs: Batch for the image backbone.
            bathy_tensor: Batch for the bathy backbone.
            sss_image: Batch for the sss backbone.
                All three are presumably ``(batch, channels, H, W)`` with the
                channel counts given at construction -- TODO confirm.

        Returns:
            Tensor of class scores with ``num_classes`` entries per sample.
        """
        backbones = (self.image_model_feat, self.bathy_model_feat, self.sss_model_feat)
        saved_heads = [backbone.model.fc for backbone in backbones]

        # The backbones are run with their fc heads temporarily replaced by
        # Identity so they return pooled features instead of class scores.
        # try/finally guarantees the heads are put back even if a backbone
        # raises; otherwise the model would be left permanently headless.
        try:
            for backbone in backbones:
                backbone.model.fc = Identity()

            image_features = self.image_model_feat(inputs)
            bathy_features = self.bathy_model_feat(bathy_tensor)
            sss_features = self.sss_model_feat(sss_image)
        finally:
            for backbone, head in zip(backbones, saved_heads):
                backbone.model.fc = head

        image_features_attended = self.attention_image(image_features)
        bathy_features_attended = self.attention_bathy(bathy_features)
        sss_features_attended = self.attention_sss(sss_features)

        # Fuse the three gated feature vectors along the feature dimension.
        combined_features = torch.cat(
            [image_features_attended, bathy_features_attended, sss_features_attended],
            dim=1,
        )

        outputs_1 = self.fc(combined_features)
        output_2 = self.fc1(outputs_1)
        outputs = self.fc2(output_2)
        return outputs