multimodal-auv-bathy-bnn-classifier / model_definitions.py

Add custom model definitions (model_definitions.py)

a7b33df verified 6 months ago

6.5 kB

	import torch
	import torch.nn as nn
	from torchvision.models import resnet50, ResNet50_Weights
	import torch.nn.functional as F
	from huggingface_hub import PyTorchModelHubMixin # Import the mixin

	# --- Custom Model Definitions ---

	class Identity(nn.Module):
	    """Pass-through module: whatever tensor goes in comes straight back out.

	    Handy as a stand-in for a layer whose computation should be skipped.
	    """

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Return ``x`` itself, neither copied nor modified."""
	        return x

	class AdditiveAttention(nn.Module):
	    """Gate projected features with a softmax over ``tanh(query + key)``.

	    The single input tensor is projected three times (query, key and value),
	    each time from ``d_model`` to ``hidden_dim`` features. The tanh of the
	    summed query and key projections is passed through one more linear layer,
	    normalised with a softmax along dimension 1, and multiplied element-wise
	    with the value projection.

	    Args:
	        d_model: Size of the last dimension of the incoming features.
	        hidden_dim: Size of every projection and of the returned features.
	    """

	    def __init__(self, d_model: int, hidden_dim: int = 128):
	        super().__init__()
	        # Keep this construction order: it determines the state_dict key order
	        # and which random draws initialise which layer.
	        self.query_projection = nn.Linear(d_model, hidden_dim)
	        self.key_projection = nn.Linear(d_model, hidden_dim)
	        self.value_projection = nn.Linear(d_model, hidden_dim)
	        # Maps hidden_dim -> hidden_dim, i.e. one weight per value feature.
	        self.attention_mechanism = nn.Linear(hidden_dim, hidden_dim)

	    def forward(self, query: torch.Tensor) -> torch.Tensor:
	        """Return the value projection of ``query`` scaled by its attention gate.

	        The softmax runs over dimension 1; presumably ``query`` is
	        ``(batch, d_model)`` so that this is the feature axis - TODO confirm.
	        """
	        projected_values = self.value_projection(query)
	        energy = torch.tanh(
	            self.query_projection(query) + self.key_projection(query)
	        )
	        gate = F.softmax(self.attention_mechanism(energy), dim=1)
	        # Element-wise product, not a weighted sum over positions.
	        return projected_values * gate

	class ResNet50Custom(nn.Module, PyTorchModelHubMixin):  # Inherit from PyTorchModelHubMixin
	    """ResNet-50 with a replaceable stem and classification head.

	    The torchvision ResNet-50 is created with ImageNet (``IMAGENET1K_V1``)
	    weights, after which its first convolution is replaced by a freshly
	    initialised one accepting ``input_channels`` channels and its final
	    fully connected layer by one producing ``num_classes`` outputs.

	    Args:
	        input_channels: Number of channels in the input images.
	        num_classes: Number of outputs of the classification head.
	        **kwargs: Extra entries merged verbatim into ``self.config``.
	    """

	    def __init__(self, input_channels: int, num_classes: int, **kwargs):
	        super().__init__()

	        # Store config for PyTorchModelHubMixin to serialize to config.json
	        self.config = {
	            "input_channels": input_channels,
	            "num_classes": num_classes,
	            **kwargs,
	        }

	        self.input_channels = input_channels

	        self.model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)

	        # New stem so arbitrary channel counts are accepted; this layer starts
	        # from random initialisation rather than the ImageNet weights.
	        self.model.conv1 = nn.Conv2d(
	            input_channels, 64, kernel_size=7, stride=2, padding=3, bias=False
	        )

	        # Remember the backbone width before touching the head, so that
	        # get_feature_size() keeps working even while `fc` is swapped out
	        # (e.g. for Identity when this model serves as a feature extractor).
	        self._feature_size = self.model.fc.in_features

	        # The final FC layer is used only when this model is a standalone
	        # classifier; a wrapping model may temporarily replace it.
	        self.model.fc = nn.Linear(self._feature_size, num_classes)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Run ``x`` through the wrapped ResNet-50, including its current ``fc``."""
	        return self.model(x)

	    def get_feature_size(self) -> int:
	        """Return the number of features fed into the classification head.

	        Uses the value cached at construction; falls back to inspecting the
	        current head for instances that were restored without running
	        ``__init__`` (e.g. an older pickled module).
	        """
	        cached = getattr(self, "_feature_size", None)
	        if cached is None:
	            cached = self.model.fc.in_features
	        return cached


	class MultiModalModel(nn.Module, PyTorchModelHubMixin):  # Inherit from PyTorchModelHubMixin
	    """Classifier that fuses three ResNet-50 branches through additive attention.

	    Each modality (image, bathymetry, SSS) has its own ``ResNet50Custom``
	    feature extractor and its own ``AdditiveAttention`` block. The three
	    attended feature vectors are concatenated and passed through three
	    linear layers to produce ``num_classes`` outputs.

	    Args:
	        image_input_channels: Channel count of the image input.
	        bathy_input_channels: Channel count of the bathymetry input.
	        sss_input_channels: Channel count of the SSS input
	            (presumably side-scan sonar - confirm with the data pipeline).
	        num_classes: Number of output classes; cast with ``int()``.
	        attention_type: Recorded in ``self.config`` and ``self.attention_type``
	            only. It is not consulted anywhere in this class:
	            ``AdditiveAttention`` is always used regardless of its value.
	        **kwargs: Extra entries merged verbatim into ``self.config``.
	    """

	    def __init__(self,
	                 image_input_channels: int,
	                 bathy_input_channels: int,
	                 sss_input_channels: int,
	                 num_classes: int,
	                 attention_type: str = "scaled_dot_product",
	                 **kwargs):  # Extra keyword arguments for mixin compatibility
	        super().__init__()

	        # Store config for PyTorchModelHubMixin to serialize to config.json
	        self.config = {
	            "image_input_channels": image_input_channels,
	            "bathy_input_channels": bathy_input_channels,
	            "sss_input_channels": sss_input_channels,
	            "num_classes": num_classes,
	            "attention_type": attention_type,
	            **kwargs,  # Pass along any extra kwargs for mixin
	        }

	        # One feature extractor per modality. Each keeps its own FC head
	        # (and therefore its state_dict entries); the head is bypassed only
	        # for the duration of a forward pass.
	        self.image_model_feat = ResNet50Custom(input_channels=image_input_channels, num_classes=num_classes)
	        self.bathy_model_feat = ResNet50Custom(input_channels=bathy_input_channels, num_classes=num_classes)
	        self.sss_model_feat = ResNet50Custom(input_channels=sss_input_channels, num_classes=num_classes)

	        # Width of the features entering the extractor's FC head
	        # (2048 for a standard ResNet-50).
	        feature_dim = self.image_model_feat.get_feature_size()

	        # Each attention block maps feature_dim -> attention_hidden_dim, so
	        # the concatenation of the three branches has 3 * 128 = 384 features.
	        attention_hidden_dim = 128
	        self.attention_image = AdditiveAttention(feature_dim, hidden_dim=attention_hidden_dim)
	        self.attention_bathy = AdditiveAttention(feature_dim, hidden_dim=attention_hidden_dim)
	        self.attention_sss = AdditiveAttention(feature_dim, hidden_dim=attention_hidden_dim)

	        # Final classification layers. Layer names and sizes are part of the
	        # checkpoint format and must not change.
	        self.fc = nn.Linear(3 * attention_hidden_dim, 1284)
	        self.fc1 = nn.Linear(1284, 32)
	        # int() raises on values that cannot be converted, so no further
	        # type check is needed before building the output layer.
	        self.fc2 = nn.Linear(32, int(num_classes))
	        self.attention_type = attention_type

	    @staticmethod
	    def _extract_features(extractor: nn.Module, x: torch.Tensor) -> torch.Tensor:
	        """Run ``extractor`` on ``x`` with its final FC layer bypassed.

	        The head is swapped for ``Identity`` only while the backbone runs and
	        is put back in a ``finally`` clause, so a failing forward pass cannot
	        leave the extractor without its classification head.
	        """
	        backbone = extractor.model
	        head = backbone.fc
	        backbone.fc = Identity()
	        try:
	            return extractor(x)
	        finally:
	            backbone.fc = head

	    def forward(self, inputs: torch.Tensor, bathy_tensor: torch.Tensor, sss_image: torch.Tensor) -> torch.Tensor:
	        """Classify one batch from its three modality tensors.

	        Args:
	            inputs: Batch for the image branch.
	            bathy_tensor: Batch for the bathymetry branch.
	            sss_image: Batch for the SSS branch.

	        Returns:
	            The output of ``fc2`` (``num_classes`` values per sample); no
	            softmax is applied here.
	        """
	        # Pre-head backbone features for every modality.
	        image_features = self._extract_features(self.image_model_feat, inputs)
	        bathy_features = self._extract_features(self.bathy_model_feat, bathy_tensor)
	        sss_features = self._extract_features(self.sss_model_feat, sss_image)

	        # Apply attention
	        image_features_attended = self.attention_image(image_features)
	        bathy_features_attended = self.attention_bathy(bathy_features)
	        sss_features_attended = self.attention_sss(sss_features)

	        # Concatenate attended features along the feature axis.
	        combined_features = torch.cat(
	            [image_features_attended, bathy_features_attended, sss_features_attended],
	            dim=1,
	        )

	        # NOTE(review): there is no activation between fc, fc1 and fc2, so the
	        # head is a composition of affine maps. Left unchanged so existing
	        # checkpoints keep producing the same outputs.
	        outputs_1 = self.fc(combined_features)
	        output_2 = self.fc1(outputs_1)
	        outputs = self.fc2(output_2)
	        return outputs