- Computer Vision
- Overview
- Computer vision enables machines to understand visual information from images and videos, powering applications like autonomous driving, medical imaging, and surveillance.
- When to Use
- Image classification and object recognition tasks
- Object detection and localization in images
- Semantic or instance segmentation projects
- Pose estimation and human activity recognition
- Face recognition and biometric systems
- Medical imaging analysis and diagnostics
- Computer Vision Tasks
- Image Classification
-
- Categorizing images into classes
- Object Detection
-
- Locating and classifying objects in images
- Semantic Segmentation
-
- Pixel-level classification
- Instance Segmentation
-
- Detecting individual object instances
- Pose Estimation
-
- Identifying human body joints
- Face Recognition
-
- Identifying individuals in images
- Popular Architectures
- Classification
-
- ResNet, VGG, EfficientNet, Vision Transformer
- Detection
-
- YOLO, Faster R-CNN, SSD, RetinaNet
- Segmentation
-
- U-Net, DeepLab, Mask R-CNN
- Pose
# - OpenPose, PoseNet, HRNet
# Python Implementation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image, ImageDraw
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torchvision import transforms, models, datasets
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import cv2
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import warnings

# Silence third-party deprecation chatter so the demo output stays readable.
warnings.filterwarnings('ignore')

print("=== 1. Image Classification CNN ===")
# Define image classification model
class ImageClassifierCNN(nn.Module):
    """Small CNN for 32x32 RGB image classification.

    Three conv -> ReLU -> BatchNorm -> MaxPool stages (3->32->64->128
    channels, each pool halving the spatial size, so 32x32 input leaves
    4x4 feature maps) followed by a dropout-regularized two-layer head.

    Args:
        num_classes: size of the output logit vector (default 10).
    """

    def __init__(self, num_classes=10):
        # Extraction had mangled "__init__" to "init"; restored so the
        # module is actually constructible through nn.Module.
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(128),
            nn.MaxPool2d(2, 2),
        )
        self.classifier = nn.Sequential(
            nn.Linear(128 * 4 * 4, 256),  # 4x4 assumes 32x32 inputs
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return raw class logits of shape (batch, num_classes)."""
        x = self.features(x)
        x = x.view(x.size(0), -1)  # flatten conv features per sample
        x = self.classifier(x)
        return x


model = ImageClassifierCNN(num_classes=10)
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
# 2. Object Detection setup
print("\n=== 2. Object Detection Framework ===")


class ObjectDetector(nn.Module):
    """Toy single-object detector: shared conv backbone with two heads.

    For a 32x32 RGB input the two pooling stages leave 8x8 feature maps,
    which feed a bbox-regression head (4 outputs: x, y, w, h) and a
    classification head (10 class logits).
    """

    def __init__(self):
        # Restored "__init__" (extraction had stripped the underscores).
        super().__init__()
        # Backbone
        self.backbone = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        # Bounding box regression: x, y, w, h
        self.bbox_head = nn.Sequential(
            nn.Linear(64 * 8 * 8, 128),
            nn.ReLU(),
            nn.Linear(128, 4),
        )
        # Class prediction: 10 classes
        self.class_head = nn.Sequential(
            nn.Linear(64 * 8 * 8, 128),
            nn.ReLU(),
            nn.Linear(128, 10),
        )

    def forward(self, x):
        """Return (bboxes, class_logits) for a batch of 32x32 images."""
        features = self.backbone(x)
        features_flat = features.view(features.size(0), -1)
        bboxes = self.bbox_head(features_flat)
        classes = self.class_head(features_flat)
        return bboxes, classes


detector = ObjectDetector()
print(f"Detector parameters: {sum(p.numel() for p in detector.parameters()):,}")
# 3. Semantic Segmentation
print("\n=== 3. Semantic Segmentation U-Net ===")


class UNet(nn.Module):
    """Minimal two-level U-Net for per-pixel classification.

    Encoder downsamples twice (3->32->64 channels), a 128-channel
    bottleneck sits at the bottom, and the decoder upsamples with
    transposed convolutions, concatenating the matching encoder feature
    maps (skip connections) before each decode block. Input height and
    width must be divisible by 4 so the skips line up.

    Args:
        num_classes: channels of the per-pixel logit output (default 5).
    """

    def __init__(self, num_classes=5):
        # Restored "__init__" (extraction had stripped the underscores).
        super().__init__()
        # Encoder
        self.enc1 = self._conv_block(3, 32)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.enc2 = self._conv_block(32, 64)
        self.pool2 = nn.MaxPool2d(2, 2)
        # Bottleneck
        self.bottleneck = self._conv_block(64, 128)
        # Decoder — each dec block's input channels are doubled by the
        # skip concatenation in forward().
        self.upconv2 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec2 = self._conv_block(128, 64)
        self.upconv1 = nn.ConvTranspose2d(64, 32, 2, stride=2)
        self.dec1 = self._conv_block(64, 32)
        # Final 1x1 projection to per-pixel class logits
        self.out = nn.Conv2d(32, num_classes, 1)

    def _conv_block(self, in_channels, out_channels):
        """Two 3x3 same-padding convolutions, each followed by ReLU."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, 3, padding=1),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        """Return logits of shape (batch, num_classes, H, W)."""
        enc1 = self.enc1(x)
        enc2 = self.enc2(self.pool1(enc1))
        bottleneck = self.bottleneck(self.pool2(enc2))
        dec2 = self.dec2(torch.cat([self.upconv2(bottleneck), enc2], 1))
        dec1 = self.dec1(torch.cat([self.upconv1(dec2), enc1], 1))
        return self.out(dec1)


unet = UNet(num_classes=5)
print(f"U-Net parameters: {sum(p.numel() for p in unet.parameters()):,}")
# 4. Transfer Learning
print("\n=== 4. Transfer Learning with Pre-trained Models ===")
try:
    # Load pre-trained ResNet18 and swap its final FC layer for a
    # 10-class head; all other layers keep their ImageNet weights.
    # (Extraction had dropped the "=" in this assignment; restored.)
    pretrained_model = models.resnet18(pretrained=True)
    num_ftrs = pretrained_model.fc.in_features
    pretrained_model.fc = nn.Linear(num_ftrs, 10)
    print("Pre-trained ResNet18 adapted for 10 classes")
    print(f"Parameters: {sum(p.numel() for p in pretrained_model.parameters()):,}")
except Exception:
    # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
    # still propagate; the weight download can legitimately fail offline.
    print("Pre-trained models not available")
# 5. Image preprocessing and augmentation
print("\n=== 5. Image Preprocessing and Augmentation ===")

# Deterministic pipeline: resize + tensor conversion + ImageNet
# normalization only.
transform_basic = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# Training pipeline: random geometric and photometric jitter applied
# before the same resize/normalize tail as above.
transform_augmented = transforms.Compose([
    transforms.RandomRotation(20),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

print("Augmentation transforms defined")
# 6. Synthetic image data
print("\n=== 6. Synthetic Image Data Creation ===")


def create_synthetic_images(num_images=100, img_size=32):
    """Create synthetic images with shapes.

    Each image is a white img_size x img_size RGB canvas with one
    randomly placed black shape. Labels: 0 = circle, 1 = rectangle,
    2 = triangle. (Extraction had dropped the "=" on the shape_type,
    center, pt1 and pts assignments; restored.)

    Returns:
        (images, labels): float32 array of shape
        (num_images, img_size, img_size, 3) scaled to [0, 1], and an
        int label array of shape (num_images,).
    """
    images = []
    labels = []
    for _ in range(num_images):
        img = np.ones((img_size, img_size, 3)) * 255
        # Randomly draw shapes
        shape_type = np.random.randint(0, 3)
        if shape_type == 0:
            # Circle — center kept >= 5px from the border
            center = (np.random.randint(5, img_size - 5),
                      np.random.randint(5, img_size - 5))
            radius = np.random.randint(3, 10)
            cv2.circle(img, center, radius, (0, 0, 0), -1)
            labels.append(0)
        elif shape_type == 1:
            # Rectangle — pt2 derived from pt1 so width/height are 5-14px
            pt1 = (np.random.randint(0, img_size - 10),
                   np.random.randint(0, img_size - 10))
            pt2 = (pt1[0] + np.random.randint(5, 15),
                   pt1[1] + np.random.randint(5, 15))
            cv2.rectangle(img, pt1, pt2, (0, 0, 0), -1)
            labels.append(1)
        else:
            # Triangle — three uniformly random vertices, filled
            pts = np.array([
                [np.random.randint(0, img_size), np.random.randint(0, img_size)],
                [np.random.randint(0, img_size), np.random.randint(0, img_size)],
                [np.random.randint(0, img_size), np.random.randint(0, img_size)],
            ])
            cv2.drawContours(img, [pts], 0, (0, 0, 0), -1)
            labels.append(2)
        images.append(img.astype(np.float32) / 255.0)
    return np.array(images), np.array(labels)


X_images, y_labels = create_synthetic_images(num_images=300, img_size=32)
print(f"Synthetic dataset: {X_images.shape}, Labels: {y_labels.shape}")
print(f"Class distribution: {np.bincount(y_labels)}")
# 7. Visualization
print("\n=== 7. Visualization ===")
fig, axes = plt.subplots(3, 3, figsize=(12, 10))

# Display synthetic images in a 3x3 grid, titled with their class label.
for i in range(9):
    idx = i % len(X_images)
    row, col = divmod(i, 3)
    axes[row, col].imshow(X_images[idx])
    axes[row, col].set_title(f"Class {y_labels[idx]}")
    axes[row, col].axis('off')

plt.suptitle("Synthetic Image Dataset", fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('synthetic_images.png', dpi=100, bbox_inches='tight')
print("Synthetic images saved as 'synthetic_images.png'")
# 8. Model architectures comparison
print("\n=== 8. Architecture Comparison ===")
architectures_info = {
    'CNN': ImageClassifierCNN(),
    'ObjectDetector': ObjectDetector(),
    'U-Net': UNet(),
}
arch_data = {
    'Architecture': list(architectures_info.keys()),
    'Parameters': [sum(p.numel() for p in m.parameters())
                   for m in architectures_info.values()],
    'Use Case': ['Classification', 'Object Detection', 'Segmentation'],
}
arch_df = pd.DataFrame(arch_data)
print("\nArchitecture Comparison:")
print(arch_df.to_string(index=False))

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Parameters comparison — log scale since the models differ by orders
# of magnitude.
axes[0].barh(arch_df['Architecture'], arch_df['Parameters'], color='steelblue')
axes[0].set_xlabel('Number of Parameters')
axes[0].set_title('Model Complexity Comparison')
axes[0].set_xscale('log')

# Use cases — color each bar by its task category.
# BUG FIX: the map previously keyed 'Detection' while the Use Case column
# holds 'Object Detection', so the lookup below raised KeyError; the key
# now matches the column value. (Also removed an unused 6-element
# use_cases list that shadowed nothing and was never read.)
colors_map = {
    'Classification': 'green',
    'Object Detection': 'orange',
    'Segmentation': 'red',
}
bar_colors = [colors_map[uc] for uc in arch_df['Use Case']]
axes[1].bar(arch_df['Architecture'], [1, 1, 1], color=bar_colors, alpha=0.7)
axes[1].set_ylabel('Primary Task')
axes[1].set_title('Architecture Use Cases')
axes[1].set_ylim([0, 1.5])
plt.tight_layout()
plt.savefig('cv_architecture_comparison.png', dpi=100, bbox_inches='tight')
print("\nArchitecture comparison saved as 'cv_architecture_comparison.png'")
# 9. Bounding box visualization
print("\n=== 9. Bounding Box Visualization ===")
fig, ax = plt.subplots(figsize=(10, 8))
ax.imshow(X_images[0])

# Draw sample bounding boxes given as (x1, y1, x2, y2) corner pairs.
bboxes = [
    (5, 5, 15, 15),
    (18, 10, 28, 20),
    (8, 20, 18, 28),
]
for bbox in bboxes:
    # matplotlib's Rectangle wants an anchor corner plus width/height,
    # so convert from the corner-pair form.
    rect = patches.Rectangle(
        (bbox[0], bbox[1]),
        bbox[2] - bbox[0],
        bbox[3] - bbox[1],
        linewidth=2,
        edgecolor='red',
        facecolor='none',
    )
    ax.add_patch(rect)
ax.set_title('Bounding Box Detection Example')
ax.axis('off')
plt.savefig('bounding_boxes.png', dpi=100, bbox_inches='tight')
# BUG FIX: the two calls below had lost their "print" names in
# extraction, leaving bare parenthesized strings (no-op expression
# statements); restored so the messages are actually emitted.
print("Bounding box visualization saved as 'bounding_boxes.png'")
print("\nComputer vision setup completed!")
- Common CV Architectures
- Classification
-
- ResNet, EfficientNet, Vision Transformer
- Detection
-
- YOLO v5, Faster R-CNN, RetinaNet
- Segmentation
-
- U-Net, DeepLab v3, Mask R-CNN
- Tracking
-
- SORT, DeepSORT, ByteTrack
- Image Preprocessing
- Resizing to standard dimensions
- Normalization with ImageNet stats
- Data augmentation (rotation, flip, crop)
- Color space conversion
- Evaluation Metrics
- Classification
-
- Accuracy, Precision, Recall, F1
- Detection
-
- mAP (mean Average Precision), IoU
- Segmentation
- IoU, Dice coefficient, Hausdorff distance
- Deliverables
- Trained vision model
- Inference pipeline
- Performance evaluation
- Visualization results
- Model optimization report
- Deployment guide