Computer vision is a field of artificial intelligence that enables computers to interpret and understand visual information from the world. This guide covers fundamental concepts, classical algorithms, and modern deep learning approaches for image and video processing.
Overview
Computer vision has revolutionized how machines interact with visual data, enabling applications ranging from autonomous vehicles to medical image analysis. By combining classical image processing techniques with modern deep learning, we can build systems that understand, analyze, and generate visual content.
Key Applications:
- Image classification and object detection
- Facial recognition and biometric systems
- Autonomous vehicles and robotics
- Medical imaging and diagnostics
- Augmented and virtual reality
- Quality control and manufacturing
- Video surveillance and security
- Document analysis and OCR
Core Concepts:
- Image representation and color spaces
- Image filtering and enhancement
- Feature detection and description
- Object detection and tracking
- Image segmentation
- 3D reconstruction and depth estimation
- Neural networks for computer vision
Getting Started
Installation and Setup
Install essential computer vision libraries:
# Core libraries
pip install opencv-python opencv-contrib-python
pip install pillow
pip install numpy matplotlib
# Deep learning frameworks
pip install torch torchvision
pip install tensorflow
# Additional tools
pip install scikit-image
pip install imageio
pip install albumentations # Image augmentation
For GPU acceleration with CUDA:
# PyTorch with CUDA
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118
# TensorFlow with GPU
pip install tensorflow[and-cuda]
Basic Image Operations
Loading and Displaying Images
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
# Load image with OpenCV (BGR format)
img_bgr = cv2.imread('image.jpg')
# Convert BGR to RGB
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
# Load with PIL
img_pil = Image.open('image.jpg')
# Load with matplotlib
img_plt = plt.imread('image.jpg')
# Display with matplotlib
plt.figure(figsize=(10, 6))
plt.imshow(img_rgb)
plt.axis('off')
plt.title('Image Display')
plt.show()
# Display with OpenCV
cv2.imshow('Image Window', img_bgr)
cv2.waitKey(0)
cv2.destroyAllWindows()
Image Properties
# Get image dimensions
height, width, channels = img_rgb.shape
print(f"Image size: {width}x{height}")
print(f"Channels: {channels}")
print(f"Data type: {img_rgb.dtype}")
print(f"Shape: {img_rgb.shape}")
# Get pixel value
pixel = img_rgb[100, 150] # [row, col]
print(f"Pixel at (150, 100): RGB = {pixel}")
# Modify pixel
img_rgb[100, 150] = [255, 0, 0] # Set to red
# Get image statistics
print(f"Min value: {img_rgb.min()}")
print(f"Max value: {img_rgb.max()}")
print(f"Mean value: {img_rgb.mean():.2f}")
print(f"Standard deviation: {img_rgb.std():.2f}")
Saving Images
# Save with OpenCV
cv2.imwrite('output.jpg', img_bgr)
cv2.imwrite('output.png', img_bgr, [cv2.IMWRITE_PNG_COMPRESSION, 9])
# Save with PIL
img_pil.save('output.jpg', quality=95)
img_pil.save('output.png', compress_level=9)
# Save with matplotlib
plt.imsave('output.png', img_rgb)
Working with Different Image Formats
from PIL import Image
import cv2
def convert_image_format(input_path, output_path, quality=95):
"""Convert image between formats."""
# Using PIL
img = Image.open(input_path)
# Convert mode if necessary
if img.mode == 'RGBA' and output_path.endswith('.jpg'):
img = img.convert('RGB')
img.save(output_path, quality=quality)
print(f"Converted {input_path} to {output_path}")
# Example conversions
convert_image_format('image.png', 'image.jpg')
convert_image_format('image.jpg', 'image.webp')
convert_image_format('image.bmp', 'image.png')
# Batch conversion
import os
from pathlib import Path
def batch_convert(input_dir, output_dir, output_format='jpg'):
"""Convert all images in directory to specified format."""
Path(output_dir).mkdir(parents=True, exist_ok=True)
for filename in os.listdir(input_dir):
if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.webp')):
input_path = os.path.join(input_dir, filename)
output_filename = Path(filename).stem + f'.{output_format}'
output_path = os.path.join(output_dir, output_filename)
convert_image_format(input_path, output_path)
# Convert all images to JPG
batch_convert('input_images/', 'output_images/', 'jpg')
Image Coordinate Systems
import numpy as np
import matplotlib.pyplot as plt
# Create sample image
img = np.zeros((400, 600, 3), dtype=np.uint8)
# OpenCV/NumPy coordinate system: (row, col) or (y, x)
# Origin (0,0) is at top-left corner
# Draw points at different locations
points = [
(50, 100, 'Top-Left Region'),
(200, 300, 'Center'),
(350, 500, 'Bottom-Right Region')
]
for y, x, label in points:
cv2.circle(img, (x, y), 5, (0, 255, 0), -1)
cv2.putText(img, label, (x+10, y), cv2.FONT_HERSHEY_SIMPLEX,
0.5, (255, 255, 255), 1)
# Draw coordinate axes
cv2.line(img, (0, 0), (100, 0), (255, 0, 0), 2) # X-axis (red)
cv2.line(img, (0, 0), (0, 100), (0, 0, 255), 2) # Y-axis (blue)
cv2.putText(img, 'X', (105, 10), cv2.FONT_HERSHEY_SIMPLEX,
0.7, (255, 0, 0), 2)
cv2.putText(img, 'Y', (5, 115), cv2.FONT_HERSHEY_SIMPLEX,
0.7, (0, 0, 255), 2)
plt.figure(figsize=(10, 6))
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.title('Image Coordinate System')
plt.xlabel('X (columns)')
plt.ylabel('Y (rows)')
plt.grid(True, alpha=0.3)
plt.show()
Region of Interest (ROI)
import cv2
# Load image
img = cv2.imread('image.jpg')
# Extract ROI using slicing [y1:y2, x1:x2]
roi = img[100:300, 150:350]
# Display ROI
cv2.imshow('Region of Interest', roi)
cv2.waitKey(0)
# Modify ROI (changes original image)
roi[:] = [0, 255, 0] # Set to green
# Copy ROI to another location
img[400:600, 150:350] = roi.copy()
# Extract ROI with boundary checking
def extract_roi(img, x, y, width, height):
"""Safely extract ROI with boundary checking."""
h, w = img.shape[:2]
# Ensure coordinates are within bounds
x1 = max(0, x)
y1 = max(0, y)
x2 = min(w, x + width)
y2 = min(h, y + height)
return img[y1:y2, x1:x2]
# Extract with bounds checking
safe_roi = extract_roi(img, 100, 150, 200, 150)
Image Channels
import cv2
import numpy as np
import matplotlib.pyplot as plt
# Load color image
img = cv2.imread('image.jpg')
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
# Split into channels
r, g, b = cv2.split(img_rgb)
# Or using NumPy indexing
r = img_rgb[:, :, 0]
g = img_rgb[:, :, 1]
b = img_rgb[:, :, 2]
# Display individual channels
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes[0, 0].imshow(img_rgb)
axes[0, 0].set_title('Original Image')
axes[0, 0].axis('off')
axes[0, 1].imshow(r, cmap='Reds')
axes[0, 1].set_title('Red Channel')
axes[0, 1].axis('off')
axes[1, 0].imshow(g, cmap='Greens')
axes[1, 0].set_title('Green Channel')
axes[1, 0].axis('off')
axes[1, 1].imshow(b, cmap='Blues')
axes[1, 1].set_title('Blue Channel')
axes[1, 1].axis('off')
plt.tight_layout()
plt.show()
# Merge channels back
merged = cv2.merge([r, g, b])
# Create false color image (swap channels)
false_color = cv2.merge([b, r, g]) # BRG instead of RGB
# Set specific channel to zero
img_no_red = img_rgb.copy()
img_no_red[:, :, 0] = 0 # Remove red channel
img_no_green = img_rgb.copy()
img_no_green[:, :, 1] = 0 # Remove green channel
img_no_blue = img_rgb.copy()
img_no_blue[:, :, 2] = 0 # Remove blue channel
Basic Image Arithmetic
import cv2
import numpy as np
# Load images
img1 = cv2.imread('image1.jpg')
img2 = cv2.imread('image2.jpg')
# Ensure same size
img2 = cv2.resize(img2, (img1.shape[1], img1.shape[0]))
# Addition (with saturation)
added = cv2.add(img1, img2)
# NumPy addition (with wrapping)
added_np = img1 + img2 # May cause overflow
# Weighted addition (blending)
alpha = 0.7
beta = 0.3
blended = cv2.addWeighted(img1, alpha, img2, beta, 0)
# Subtraction
subtracted = cv2.subtract(img1, img2)
# Multiplication
multiplied = cv2.multiply(img1, img2)
# Division
divided = cv2.divide(img1, img2)
# Bitwise operations
bitwise_and = cv2.bitwise_and(img1, img2)
bitwise_or = cv2.bitwise_or(img1, img2)
bitwise_xor = cv2.bitwise_xor(img1, img2)
bitwise_not = cv2.bitwise_not(img1)
# Brightness adjustment
bright = cv2.add(img1, 50) # Increase brightness
dark = cv2.subtract(img1, 50) # Decrease brightness
# Contrast adjustment
contrast = cv2.multiply(img1, 1.5) # Increase contrast
# Clipping
def adjust_brightness_contrast(img, brightness=0, contrast=1.0):
"""Adjust brightness and contrast."""
adjusted = cv2.multiply(img, contrast)
adjusted = cv2.add(adjusted, brightness)
return np.clip(adjusted, 0, 255).astype(np.uint8)
result = adjust_brightness_contrast(img1, brightness=30, contrast=1.2)
Content Sections (To Be Completed)
The following sections will be developed progressively:
Color Spaces and Conversion
Content coming soon...
Image Filtering and Enhancement
Content coming soon...
Edge Detection
Content coming soon...
Feature Detection and Description
Content coming soon...
Object Detection
Content coming soon...
Image Segmentation
Content coming soon...
Face Detection and Recognition
Content coming soon...
Optical Character Recognition (OCR)
Content coming soon...
Video Processing
Content coming soon...
3D Vision and Depth Estimation
Content coming soon...
Deep Learning for Computer Vision
Content coming soon...
Real-World Applications
Content coming soon...
Best Practices
Content coming soon...