Mathematics for AI: Linear Algebra¶

Scalars, Vectors, Matrices, Tensors¶

In [ ]:
import numpy as np

# Scalars
# Definition: A single number (0D), e.g., a learning rate in AI models
learning_rate = 0.01  # Scalar for gradient descent step size
print("Scalar (Learning Rate):", learning_rate)
# Real-world: Controls how fast a neural network learns
# Operation: Simple arithmetic
scaled_value = learning_rate * 100
print("Scaled Scalar:", scaled_value)

# Vectors
# Definition: 1D array, e.g., feature vector in machine learning
feature_vector = np.array([0.5, 0.8, 0.2])  # Represents a data point (e.g., customer purchase history)
print("\nVector (Feature Vector):", feature_vector)
# Real-world: Used in recommendation systems or word embeddings
# Operation: Dot product (measures similarity)
another_vector = np.array([0.1, 0.4, 0.7])
dot_product = np.dot(feature_vector, another_vector)
print("Dot Product:", dot_product)

# Matrices
# Definition: 2D array, e.g., for image data or linear transformations
image_patch = np.array([[1, 2, 3], [4, 5, 6]])  # 2x3 matrix (e.g., grayscale image patch)
print("\nMatrix (Image Patch):\n", image_patch)
# Real-world: Used in CNNs for image processing
# Operation: Matrix multiplication (e.g., for transformations)
weights = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])  # 3x2 weight matrix
result = np.matmul(image_patch, weights)  # 2x2 result
print("Matrix Multiplication Result:\n", result)

# Tensors
# Definition: nD array (n≥0), e.g., for RGB images or video data
rgb_image = np.random.rand(224, 224, 3)  # 3D tensor (height, width, RGB channels)
print("\nTensor (RGB Image Shape):", rgb_image.shape)
# Real-world: Input to CNNs like ResNet for image classification
# Operation: Tensor slicing (e.g., extract red channel)
red_channel = rgb_image[:, :, 0]  # First channel (R)
print("Red Channel Shape:", red_channel.shape)
# Example: 4D tensor for a batch of images
batch_images = np.random.rand(2, 4, 4, 3)  # Batch of 2 RGB 4x4 images
print("Batch Tensor Shape:", batch_images.shape)
Scalar (Learning Rate): 0.01
Scaled Scalar: 1.0

Vector (Feature Vector): [0.5 0.8 0.2]
Dot Product: 0.51

Matrix (Image Patch):
 [[1 2 3]
 [4 5 6]]
Matrix Multiplication Result:
 [[2.2 2.8]
 [4.9 6.4]]

Tensor (RGB Image Shape): (224, 224, 3)
Red Channel Shape: (224, 224)
Batch Tensor Shape: (2, 4, 4, 3)

Scalars¶

  • Definition/Explanation: A scalar is a single numerical value, representing magnitude without direction.
    • Scalars can be integers, floating-point numbers, or complex numbers.
    • Used in operations like scaling vectors or adjusting model parameters.
    • Typically denoted by lowercase letters (e.g., $a$, $b$).
In AI, scalars are used to represent constants, weights, or scaling factors.

  • Examples:
    • Learning rate in gradient descent ($\alpha = 0.01$).
    • A pixel intensity value in an image (e.g., 255 for white in grayscale).
  • Why It Matters:
    • Scalars are fundamental in AI for parameter tuning (e.g., learning rates in neural networks).
    • Real-world: Adjusting the learning rate in a neural network to optimize training speed and accuracy.

Vectors¶

  • Definition/Explanation: A vector is an ordered list of scalars, representing magnitude and direction in a multidimensional space.
    • Represented as 1D arrays (e.g., $\mathbf{v} = [v_1, v_2, \dots, v_n]$).
    • Operations: addition, dot product, scaling.
    • Used in feature representation and embeddings.
In AI, vectors represent data points, features, or weights.
  • Examples:
    • A feature vector for a house: $[1200, 3, 2]$ (square footage, bedrooms, bathrooms).
    • Word embeddings in NLP (e.g., Word2Vec output: $[0.5, -0.2, 0.1, \dots]$).
  • Why It Matters:
    • Vectors are core to machine learning, representing data in models like SVMs or neural networks.
    • Real-world: Encoding user preferences in recommendation systems (e.g., Netflix movie preferences as a vector).

Matrices¶

  • Definition/Explanation: A matrix is a 2D array of scalars, organized in rows and columns.
    • Denoted as $A \in \mathbb{R}^{m \times n}$ ($m$ rows, $n$ columns).
    • Operations: matrix multiplication, transposition, inversion.
    • Used in data storage, transformations, and neural network layers.
In AI, matrices are used to represent linear transformations or datasets.
  • Examples:
    • A dataset matrix: rows as samples, columns as features (e.g., $1000 \times 5$ for 1000 houses with 5 features).
    • Weight matrix in a neural network layer: $W \in \mathbb{R}^{n \times m}$ connecting input to output neurons.
  • Why It Matters:
    • Matrices enable efficient computation in AI, especially in deep learning (e.g., matrix multiplication in GPUs).
    • Real-world: Image processing (e.g., a grayscale image as a matrix of pixel intensities).




Matrix Basics¶

  • Matrix: A rectangular array of numbers. Example:

    $$ A = \begin{bmatrix} 1 & 2 \\ 3 & 4 \end{bmatrix} \in \mathbb{R}^{2 \times 2} $$

  • Dimensions: Rows × Columns


Matrix Operations¶

  • Addition/Subtraction: Element-wise (same dimensions)

  • Scalar Multiplication:

    $$ \alpha A = \begin{bmatrix} \alpha a_{11} & \alpha a_{12} \\ \alpha a_{21} & \alpha a_{22} \end{bmatrix} $$

  • Matrix Multiplication:

    $$ C = A \cdot B \quad \text{(valid only if columns of A = rows of B)} $$

  • Transpose:

    $$ A^T = \text{flip rows and columns} $$

  • Identity Matrix:

    $$ I_n = \text{square matrix with 1s on diagonal} $$


Special Matrices¶

| Type | Property |
|---|---|
| Square | Same number of rows and columns |
| Diagonal | Non-zero entries only on the diagonal |
| Symmetric | $A = A^T$ |
| Orthogonal | $A^T A = I$ (columns are orthonormal) |
| Zero matrix | All elements are zero |

Matrix Inverse¶

  • For $A \in \mathbb{R}^{n \times n}$, $A^{-1}$ satisfies:

    $$ A A^{-1} = A^{-1} A = I $$

  • Only exists if $\det(A) \neq 0$ and $A$ is square.


Determinant¶

  • Scalar value that can be computed from a square matrix:

    $$ \text{det}(A) $$

  • Used to check invertibility and volume scaling.


Rank¶

  • Number of linearly independent rows/columns.
  • Indicates the dimension of the column space.

Trace¶

  • Sum of diagonal elements of a square matrix:

    $$ \text{tr}(A) = \sum_i a_{ii} $$
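
To make the operations above concrete (multiplication, transpose, identity, inverse, determinant, rank, trace), here is a minimal NumPy sketch; the matrix values are arbitrary illustrations, not taken from the text.

In [ ]:
import numpy as np

A = np.array([[1.0, 2.0],
              [3.0, 4.0]])
B = np.array([[0.0, 1.0],
              [1.0, 0.0]])

print("A @ B:\n", A @ B)                      # matrix multiplication
print("A^T:\n", A.T)                          # transpose
print("I_2:\n", np.eye(2))                    # identity matrix
print("det(A):", np.linalg.det(A))            # determinant (-2.0, so A is invertible)
print("A^-1:\n", np.linalg.inv(A))            # inverse
print("rank(A):", np.linalg.matrix_rank(A))   # number of independent rows/columns
print("trace(A):", np.trace(A))               # sum of diagonal elements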


Eigenvalues and Eigenvectors¶

  • For $A \vec{v} = \lambda \vec{v}$:

    • $\lambda$: eigenvalue
    • $\vec{v}$: eigenvector

Matrix Decompositions¶

| Type | Form | Use Case |
|---|---|---|
| LU | $A = LU$ | Solving systems |
| QR | $A = QR$ | Least squares, orthogonal bases |
| SVD | $A = U \Sigma V^T$ | PCA, compression, LSA |
| Eigendecomposition | $A = V \Lambda V^{-1}$ | PCA, diagonalization |

Applications in ML/AI¶

| Matrix Concept | Application |
|---|---|
| Multiplication | Neural networks, transformations |
| Transpose | Covariance, dot products |
| Inverse | Solving linear systems |
| Rank | Dimensionality and redundancy |
| SVD/PCA | Dimensionality reduction |

Matrix Norms & Condition Number¶

What is a Matrix Norm?¶

A matrix norm measures the "size", "length", or "magnitude" of a matrix — similar to how a vector norm measures a vector’s length.

Think of it as: “How much can this matrix stretch or shrink a vector?”


Common Types of Matrix Norms¶
| Norm | Definition | Interpretation | Example/Notes |
|---|---|---|---|
| Frobenius norm $\lVert A \rVert_F$ | $\sqrt{\sum_{i,j} a_{ij}^2}$ | Like the Euclidean norm for matrices | Used a lot in ML |
| 1-norm $\lVert A \rVert_1$ | Max absolute column sum | Worst-case vertical stretching | $\max_j \sum_i \lvert a_{ij} \rvert$ |
| Infinity norm $\lVert A \rVert_\infty$ | Max absolute row sum | Worst-case horizontal stretching | $\max_i \sum_j \lvert a_{ij} \rvert$ |
| 2-norm (spectral norm) $\lVert A \rVert_2$ | Largest singular value of $A$ | Max stretching factor along any direction | Related to principal components, SVD, etc. |

Example (Frobenius Norm):¶
$$ A = \begin{bmatrix} 1 & 2 \\ 3 & 4 \end{bmatrix} \Rightarrow \|A\|_F = \sqrt{1^2 + 2^2 + 3^2 + 4^2} = \sqrt{30} $$

Condition Number¶

The condition number tells you how sensitive a system or a matrix operation is to small changes in input.

In ML and numerical algorithms, lower condition numbers = more stable, reliable results.


Definition¶

For a non-singular matrix $A$:

$$ \text{cond}(A) = \|A\| \cdot \|A^{-1}\| $$
  • You can use any norm (commonly 2-norm or Frobenius).
  • If $\text{cond}(A) \approx 1$: Very stable system
  • If $\text{cond}(A) \gg 1$: Unstable or ill-conditioned

In terms of singular values:¶

For 2-norm:

$$ \text{cond}_2(A) = \frac{\sigma_{\text{max}}}{\sigma_{\text{min}}} $$

Where $\sigma$ = singular values from SVD.


⚠️ Why It Matters in ML?¶

| Use Case | Impact |
|---|---|
| Solving linear systems | High condition number → errors get amplified |
| Inverting matrices | Poor conditioning → instability |
| Gradient descent | Ill-conditioned Hessian → slow convergence |
| Deep learning | Can cause vanishing/exploding gradients |
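
As a quick numerical check of the norms and the condition number described above, here is a small NumPy sketch (the 2×2 matrix is the same one used in the Frobenius-norm example; everything else is illustrative):

In [ ]:
import numpy as np

A = np.array([[1.0, 2.0],
              [3.0, 4.0]])

print("Frobenius norm:", np.linalg.norm(A, 'fro'))   # sqrt(30) ≈ 5.477
print("1-norm:", np.linalg.norm(A, 1))               # max absolute column sum = 6
print("Infinity norm:", np.linalg.norm(A, np.inf))   # max absolute row sum = 7
print("2-norm (spectral):", np.linalg.norm(A, 2))    # largest singular value

# Condition number in the 2-norm: sigma_max / sigma_min
print("cond_2(A):", np.linalg.cond(A, 2))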



Moore-Penrose Pseudoinverse¶

What is the Moore-Penrose Pseudoinverse?¶

When a matrix doesn’t have an inverse (like when it’s non-square or rank-deficient), we use the pseudoinverse instead.

The Moore-Penrose Pseudoinverse $A^+$ of a matrix $A$ is a generalization of the inverse:

  • It works for any matrix (square, rectangular, full-rank, or not).
  • It’s the best possible approximate inverse.

Denoted by:¶
$$ A^+ \quad \text{(read: A pseudo-inverse)} $$

If $A$ is an $m \times n$ matrix, then $A^+$ is an $n \times m$ matrix.


When Do We Use It?¶
| Scenario | Why use the pseudoinverse |
|---|---|
| $A$ is not square | No usual inverse exists |
| $A$ is not full-rank | Singular matrix (can't invert) |
| You want a least-squares solution to $Ax = b$ | Regression, ML, optimization |

Mathematical Formulation¶

If $A$ is an $m \times n$ matrix:

  • The pseudoinverse $A^+$ is the unique matrix that satisfies:
$$ \begin{aligned} 1. & \quad A A^+ A = A \\ 2. & \quad A^+ A A^+ = A^+ \\ 3. & \quad (A A^+)^T = A A^+ \\ 4. & \quad (A^+ A)^T = A^+ A \end{aligned} $$

Computation Using SVD (ML Use Case)¶

Let:

$$ A = U \Sigma V^T $$

Then the pseudoinverse is:

$$ A^+ = V \Sigma^+ U^T $$

Where $\Sigma^+$ is obtained by:

  • Taking reciprocal of each non-zero singular value
  • Transposing the diagonal matrix
  • Filling remaining entries with zero

Least-Squares Solution (ML Use Case)¶

In ML, we often solve:

$$ Ax = b \quad \text{(no exact solution if overdetermined)} $$

Then the least-squares solution is:

$$ x = A^+ b $$

Used in Linear Regression when $X$ is not square or full-rank.
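
A minimal sketch of this least-squares use case with np.linalg.pinv (the data values below are made up for illustration):

In [ ]:
import numpy as np

# Overdetermined system: 4 equations, 2 unknowns (no exact solution in general)
A = np.array([[1.0, 1.0],
              [1.0, 2.0],
              [1.0, 3.0],
              [1.0, 4.0]])
b = np.array([1.1, 1.9, 3.2, 3.9])

x_ls = np.linalg.pinv(A) @ b              # least-squares solution x = A^+ b
print("Least-squares solution:", x_ls)

# The dedicated solver gives the same answer (up to numerical error)
x_lstsq, *_ = np.linalg.lstsq(A, b, rcond=None)
print("np.linalg.lstsq solution:", x_lstsq)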


Real ML Applications¶
| Use Case | Why it helps |
|---|---|
| Linear regression | $\theta = (X^T X)^{-1} X^T y$ becomes $\theta = X^+ y$ when $X$ isn't full-rank |
| Dimensionality reduction | In SVD, the pseudoinverse helps in low-rank approximations |
| Deep learning | Used in layer inversion, autoencoders, or backprop through linear layers |
| Control systems | Solve under/overdetermined linear systems |

Pro Tip:¶

If you're designing ML algorithms from scratch (like in research), or dealing with data where features >> samples (underdetermined), pseudoinverse gives you stable, analytical solutions — no need for iterative solvers.




Kronecker Product & Hadamard Product¶

What is the Kronecker Product?¶

The Kronecker product is an operation on two matrices that produces a block matrix. It’s not element-wise, and it’s not the dot product.

For matrices $A \in \mathbb{R}^{m \times n}$ and $B \in \mathbb{R}^{p \times q}$, their Kronecker product $A \otimes B$ is of size $(mp \times nq)$


Notation¶
$$ A \otimes B \quad \text{(pronounced “A kronecker B”)} $$
How It Works — Example¶

Let:

$$ A = \begin{bmatrix} a & b \\ c & d \end{bmatrix}, \quad B = \begin{bmatrix} x & y \\ z & w \end{bmatrix} $$

Then:

$$ A \otimes B = \begin{bmatrix} aB & bB \\ cB & dB \end{bmatrix} = \begin{bmatrix} a x & a y & b x & b y \\ a z & a w & b z & b w \\ c x & c y & d x & d y \\ c z & c w & d z & d w \\ \end{bmatrix} $$


What is the Hadamard Product?¶

The Hadamard product is an element-wise multiplication between two matrices (or vectors) of the same shape.

If $A, B \in \mathbb{R}^{m \times n}$, then:

$$ A \circ B = [a_{ij} \cdot b_{ij}] $$

🔁 Every element in position $(i, j)$ of the result is the product of $a_{ij} \cdot b_{ij}$.


Notation¶
  • $A \circ B$ — Hadamard product
  • Sometimes written as A * B (in NumPy, PyTorch, TensorFlow when using element-wise ops)

Example¶

Let:

$$ A = \begin{bmatrix} 1 & 2 \\ 3 & 4 \end{bmatrix}, \quad B = \begin{bmatrix} 5 & 6 \\ 7 & 8 \end{bmatrix} $$

Then:

$$ A \circ B = \begin{bmatrix} 1 \cdot 5 & 2 \cdot 6 \\ 3 \cdot 7 & 4 \cdot 8 \end{bmatrix} = \begin{bmatrix} 5 & 12 \\ 21 & 32 \end{bmatrix} $$
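
A minimal NumPy check of both products, reusing the 2×2 matrices from the Hadamard example above:

In [ ]:
import numpy as np

A = np.array([[1, 2],
              [3, 4]])
B = np.array([[5, 6],
              [7, 8]])

print("Hadamard product A * B:\n", A * B)                    # element-wise: [[5, 12], [21, 32]]
print("Kronecker product np.kron(A, B):\n", np.kron(A, B))   # 4x4 block matrix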
Applications in ML & Deep Learning¶
| Use Case | Product |
|---|---|
| Element-wise operations in ML (e.g., attention, masking) | ✅ Hadamard |
| Creating structured large matrices from small ones | ✅ Kronecker |
| Efficient parameter sharing in deep learning (e.g., tensor compression) | ✅ Kronecker |

Recap¶

  • Dot Product: Projects one vector onto another (returns a scalar)
  • Outer Product: Expands two vectors into a matrix
  • Kronecker Product: Expands two matrices into a block matrix
  • Matrix Multiplication: Follows linear transformation rules
  • Hadamard Product: Element-wise multiplication; combines features/gradients/masks

Tensors¶

  • Definition/Explanation: A tensor is a generalized array with arbitrary dimensions, extending scalars (0D), vectors (1D), and matrices (2D) to higher dimensions.

    • Rank: Number of dimensions (e.g., scalar = rank-0, vector = rank-1, matrix = rank-2).
    • Used in deep learning frameworks (e.g., TensorFlow, PyTorch).
    • Operations: tensor contraction, reshaping, slicing.
In AI, tensors store complex data like images or videos.

  • Examples:

    • A color image: $256 \times 256 \times 3$ tensor (height, width, RGB channels).
    • A batch of videos: $32 \times 10 \times 720 \times 1280 \times 3$ (batch size, frames, height, width, channels).
  • Why It Matters:

    • Tensors handle multidimensional data in deep learning (e.g., CNNs for image recognition).
    • Real-world: Processing 3D medical scans (e.g., MRI images) or video analysis for autonomous driving.

Vector Spaces and Subspaces¶

Vectors¶

  • Definition: An ordered array of numbers (elements), e.g.

    $$ \vec{v} = \begin{bmatrix} 2 \\ -1 \\ 3 \end{bmatrix} \in \mathbb{R}^3 $$

  • Operations:

    • Addition: $\vec{a} + \vec{b}$
    • Scalar multiplication: $\alpha \vec{v}$
    • Dot product: $\vec{a} \cdot \vec{b} = \sum a_i b_i$ (produces scalar)
    • Norm (length):

      $$ \|\vec{v}\| = \sqrt{v_1^2 + v_2^2 + \dots + v_n^2} $$


Vector Space (Linear Space)¶

A vector space is a set of vectors that can be added together and multiplied by scalars, and still remain in the set.

Formal Requirements (8 Axioms):

Let $V$ be a vector space over a field $F$ (like ℝ or ℂ). For all $\vec{u}, \vec{v}, \vec{w} \in V$, and $a, b \in F$, the following must hold:

  1. Closure under addition: $\vec{u} + \vec{v} \in V$
  2. Closure under scalar multiplication: $a\vec{v} \in V$
  3. Associativity of addition
  4. Commutativity of addition
  5. Additive identity: There exists $\vec{0} \in V$ such that $\vec{v} + \vec{0} = \vec{v}$
  6. Additive inverse: For every $\vec{v}$, $-\vec{v} \in V$
  7. Multiplicative identity: $1\vec{v} = \vec{v}$
  8. Distributivity of scalar multiplication over vector addition and over scalar addition

Examples of Vector Spaces

| Space | Description |
|---|---|
| $\mathbb{R}^n$ | n-dimensional real vectors |
| Matrices | All $m \times n$ real matrices |
| Polynomials | Polynomials of degree ≤ n |
| Functions | All continuous real functions |

Subspace¶

A subspace is a subset of a vector space that is also a vector space under the same operations.

Requirements:

For $W \subseteq V$, $W$ is a subspace if:

  1. $\vec{0} \in W$ (zero vector included)
  2. Closed under vector addition: $\vec{u} + \vec{v} \in W$
  3. Closed under scalar multiplication: $c\vec{v} \in W$

Example:

  • In $\mathbb{R}^3$, the set of all vectors on the x-y plane (i.e., vectors of form $[x, y, 0]$) is a subspace.

Real World ML/AI Examples¶

  1. Word Embeddings & NLP (Natural Language Processing)

    • Vector Space: Words are converted into vectors in a high-dimensional vector space (like 300 dimensions in Word2Vec or 768 in BERT).

    • Why vector spaces? Each word is a point (vector) in this space. Words with similar meanings cluster close to each other — that’s the geometry of meaning.

    • Subspace idea: When you focus on specific topics (say sports-related words), you’re essentially looking at a subspace of the whole embedding space.

    • Real-world: Chatbots, search engines, language translation — all use vector spaces to understand semantic similarity and context.


  2. Face Recognition Systems

    • What happens? Images of faces are converted into vectors (e.g., using deep learning embeddings).

    • Subspace models: Techniques like Eigenfaces represent face images as points in a face subspace. Recognition happens by comparing projections in this subspace.


Why This Matters for You:

Understanding vector spaces and subspaces means you can:

  • Build better feature representations.
  • Understand how data can be compressed and interpreted.
  • Improve model explainability by identifying key directions (basis vectors) in data.
  • Innovate by manipulating embeddings and latent spaces.

Span & Linear Combination¶

A linear combination is an expression of the form:

$$ a_1\vec{v}_1 + a_2\vec{v}_2 + \dots + a_n\vec{v}_n $$

Where $a_i$ are scalars and $\vec{v}_i$ are vectors.


The span of a set of vectors is all possible linear combinations of those vectors.

$$ \text{Span}(\{\vec{v}_1, \vec{v}_2\}) = \{a_1\vec{v}_1 + a_2\vec{v}_2 \mid a_1, a_2 \in \mathbb{R} \} $$
  • Span is always a subspace.
  • Example:

    $$ \text{span}\left\{ \begin{bmatrix} 1 \\ 0 \end{bmatrix}, \begin{bmatrix} 0 \\ 1 \end{bmatrix} \right\} = \mathbb{R}^2 $$

  • The span of any set of vectors in $\mathbb{R}^n$ is a subspace of $\mathbb{R}^n$.


Linear Independence¶

A set of vectors is linearly independent if no vector in the set can be written as a linear combination of the others.

Mathematically:

$$ a_1\vec{v}_1 + a_2\vec{v}_2 + \dots + a_n\vec{v}_n = \vec{0} \Rightarrow a_1 = a_2 = \dots = a_n = 0 $$
  • If not, the vectors are linearly dependent.
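
A common practical test is to stack the vectors as columns and compare the matrix rank with the number of vectors; the sketch below uses illustrative vectors, one of which is deliberately dependent on the others.

In [ ]:
import numpy as np

v1 = np.array([1.0, 0.0, 2.0])
v2 = np.array([0.0, 1.0, 1.0])
v3 = v1 + 2 * v2                      # constructed to be linearly dependent

M = np.column_stack([v1, v2, v3])     # vectors as columns
rank = np.linalg.matrix_rank(M)
print("rank:", rank, "out of", M.shape[1], "vectors")
print("Linearly independent?", rank == M.shape[1])   # False for this example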

Basis¶

A basis of a vector space is a set of linearly independent vectors that span the entire space.

  • Every vector in the space can be uniquely represented as a linear combination of basis vectors.

Example:

  • Standard basis of $\mathbb{R}^2$:
$$ \left\{ \begin{bmatrix} 1 \\ 0 \end{bmatrix}, \begin{bmatrix} 0 \\ 1 \end{bmatrix} \right\} $$

Dimension¶

  • The dimension of a vector space is the number of vectors in a basis for that space.
  • For $\mathbb{R}^n$, dimension = $n$

Projection of a Vector¶

  • Projection of $\vec{a}$ onto $\vec{b}$:

    $$ \text{proj}_{\vec{b}} \vec{a} = \frac{\vec{a} \cdot \vec{b}}{\|\vec{b}\|^2} \vec{b} $$


Eigenvalues and Eigenvectors¶

What are Eigenvalues and Eigenvectors?¶


  • For a square matrix $A \in \mathbb{R}^{n \times n}$, a non-zero vector $\vec{v}$ is an eigenvector if:
$$ A\vec{v} = \lambda \vec{v} $$
  • $\lambda$ is the eigenvalue corresponding to eigenvector $\vec{v}$

How to Compute¶

  1. Find eigenvalues by solving the characteristic equation:

    $$ \det(A - \lambda I) = 0 $$

  2. Find eigenvectors for each $\lambda$ by solving:

    $$ (A - \lambda I)\vec{v} = 0 $$


Example

$$ A = \begin{bmatrix} 4 & 1 \\ 2 & 3 \end{bmatrix} $$
  • Find eigenvalues by solving:
$$ \det\begin{bmatrix} 4-\lambda & 1 \\ 2 & 3-\lambda \end{bmatrix} = 0 $$

$$ (4-\lambda)(3-\lambda) - 2 \times 1 = 0 $$

$$ \lambda^2 - 7\lambda + 10 = 0 $$

$$ (\lambda - 5)(\lambda - 2) = 0 $$

So eigenvalues: $\lambda_1 = 5, \lambda_2 = 2$

  • For $\lambda = 5$, solve $(A - 5I)\mathbf{v} = 0$:
$$ \begin{bmatrix} -1 & 1 \\ 2 & -2 \end{bmatrix} \mathbf{v} = 0 $$

Eigenvector $\mathbf{v}_1 = \begin{bmatrix} 1 \\ 1 \end{bmatrix}$ (up to scalar multiples)
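
The same example can be verified numerically with np.linalg.eig (a quick sketch; the ordering of the eigenvalues returned by NumPy may differ):

In [ ]:
import numpy as np

A = np.array([[4.0, 1.0],
              [2.0, 3.0]])

eigvals, eigvecs = np.linalg.eig(A)
print("Eigenvalues:", eigvals)            # 5 and 2
print("Eigenvectors (columns):\n", eigvecs)

# Check A v = lambda v for the first eigenpair
v = eigvecs[:, 0]
print("A @ v:      ", A @ v)
print("lambda * v: ", eigvals[0] * v)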


Intuitions & Properties¶

  • Intuitions:
    • Applying $A$ to $\mathbf{v}$ only stretches/compresses it, but does not change its direction.
    • Eigenvectors show the “principal directions” of transformation by $A$.
    • Eigenvalues tell how much $\mathbf{v}$ is stretched (if $|\lambda| > 1$) or shrunk (if $|\lambda| < 1$).
  • Properties:

    • Eigenvectors corresponding to different eigenvalues are linearly independent.
    • Sum of eigenvalues = trace of matrix $A$ (sum of diagonal entries). $$\text{tr}(A) = \sum \lambda_i$$
    • Product of eigenvalues = determinant of matrix $A$. $$\det(A) = \prod \lambda_i$$
    • If $A$ is symmetric, eigenvalues are real and eigenvectors are orthogonal.

Use Cases of Eigenvalues & Eigenvectors¶

Principal Component Analysis (PCA)

  • Purpose: Dimensionality reduction — simplifying complex data while preserving its most important features.
  • How: Find eigenvectors of the covariance matrix of data. These eigenvectors (principal components) show directions of maximum variance.
  • Benefit: Reduces features, speeds up learning, reduces noise, improves visualization.
  • Example: Face recognition, image compression.
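
Below is a minimal PCA-by-eigendecomposition sketch of the recipe just described; the data are random numbers used purely for illustration (in practice you would typically use a library implementation such as scikit-learn's PCA).

In [ ]:
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))            # 100 samples, 3 features (illustrative data)

Xc = X - X.mean(axis=0)                  # center the data
cov = np.cov(Xc, rowvar=False)           # 3x3 covariance matrix

eigvals, eigvecs = np.linalg.eigh(cov)   # eigh: the covariance matrix is symmetric
order = np.argsort(eigvals)[::-1]        # sort by decreasing variance
components = eigvecs[:, order[:2]]       # top-2 principal directions

X_reduced = Xc @ components              # project onto the principal components
print("Reduced shape:", X_reduced.shape) # (100, 2)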

Applications in ML/AI

| Concept | Application |
|---|---|
| PCA (Principal Component Analysis) | Find directions (eigenvectors) of max variance |
| Spectral clustering | Eigenvectors of the graph Laplacian for clustering |
| Stability analysis | Eigenvalues determine system behavior |
| Markov chains | Eigenvalues define steady states |
| Neural networks | Eigenvalues of the Hessian matrix characterize the optimization landscape |

Diagonalization¶

If $A$ has $n$ linearly independent eigenvectors:

$$ A = V \Lambda V^{-1} $$
  • $V$: matrix of eigenvectors
  • $\Lambda$: diagonal matrix of eigenvalues

Spectral Theorem¶

What Is the Spectral Theorem?

The Spectral Theorem states that any real symmetric matrix can be diagonalized by an orthogonal matrix.

Formally:

If $A \in \mathbb{R}^{n \times n}$ is real symmetric (i.e., $A^T = A$), then:

$$ A = Q \Lambda Q^T $$

Where:

  • $Q$ is an orthogonal matrix (columns are orthonormal eigenvectors),
  • $\Lambda$ is a diagonal matrix (eigenvalues of $A$).

Intuition

Imagine a real symmetric matrix as a "nice" transformation — like stretching or compressing along specific axes. The Spectral Theorem tells us:

There exists a special coordinate system (formed by the eigenvectors) in which the matrix just scales (not rotates or shears).
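
A quick numerical illustration of the theorem (the symmetric matrix is chosen arbitrarily): np.linalg.eigh returns orthonormal eigenvectors for symmetric matrices, so $Q \Lambda Q^T$ should reconstruct $A$.

In [ ]:
import numpy as np

A = np.array([[2.0, 1.0],
              [1.0, 2.0]])              # real symmetric matrix

eigvals, Q = np.linalg.eigh(A)          # Q has orthonormal columns
Lambda = np.diag(eigvals)

print("Q^T Q:\n", Q.T @ Q)              # ≈ identity, so Q is orthogonal
print("Q Λ Q^T:\n", Q @ Lambda @ Q.T)   # ≈ A, the spectral decomposition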


Linear Transformations¶

What Is a Linear Transformation?¶

A linear transformation is a function $T: \mathbb{R}^n \to \mathbb{R}^m$ that satisfies two key properties:

  1. Additivity:

    $$ T(\vec{u} + \vec{v}) = T(\vec{u}) + T(\vec{v}) $$

  2. Homogeneity (scalar multiplication):

    $$ T(c\vec{v}) = cT(\vec{v}) $$

A transformation is linear if and only if it preserves vector addition and scalar multiplication.


Matrix Representation of a Linear Transformation¶

Every linear transformation can be represented as matrix multiplication:

$$ T(\vec{x}) = A \vec{x} $$
  • $A \in \mathbb{R}^{m \times n}$ is the transformation matrix.
  • $\vec{x} \in \mathbb{R}^n$ is the input vector.

Common Examples of Linear Transformations¶

| Transformation | Matrix $A$ | Effect |
|---|---|---|
| Identity | $I = \begin{bmatrix} 1 & 0 \\ 0 & 1 \end{bmatrix}$ | Leaves vectors unchanged |
| Scaling | $\begin{bmatrix} s & 0 \\ 0 & s \end{bmatrix}$ | Enlarges or shrinks vectors |
| Rotation (2D) | $\begin{bmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{bmatrix}$ | Rotates vectors by $\theta$ |
| Reflection (about x-axis) | $\begin{bmatrix} 1 & 0 \\ 0 & -1 \end{bmatrix}$ | Flips over the x-axis |
| Projection onto x-axis | $\begin{bmatrix} 1 & 0 \\ 0 & 0 \end{bmatrix}$ | Projects onto the horizontal axis |
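
To make the table concrete, here is a short sketch that applies a 2D rotation matrix to a vector and checks linearity (the angle and vectors are arbitrary examples):

In [ ]:
import numpy as np

theta = np.pi / 2                               # 90-degree rotation (illustrative)
R = np.array([[np.cos(theta), -np.sin(theta)],
              [np.sin(theta),  np.cos(theta)]])

x = np.array([1.0, 0.0])
print("R @ x:", R @ x)                          # ≈ [0, 1]: the x-axis maps onto the y-axis

# Linearity check: T(u + v) == T(u) + T(v)
u, v = np.array([1.0, 2.0]), np.array([-3.0, 0.5])
print("Additivity holds:", np.allclose(R @ (u + v), R @ u + R @ v))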

Kernel and Image¶

  • Kernel (null space): Set of vectors that map to the zero vector:

    $$ \ker(T) = \{ \vec{x} : T(\vec{x}) = \vec{0} \} $$

  • Image (range): All vectors that are outputs of $T$:

    $$ \text{Im}(T) = \{ T(\vec{x}) : \vec{x} \in \mathbb{R}^n \} $$


Properties of Linear Transformations¶

| Property | Meaning |
|---|---|
| Linearity | Preserves addition and scalar multiplication |
| Composable | $T_1(T_2(\vec{x})) = (T_1 \circ T_2)(\vec{x})$ |
| Invertible | There exists $T^{-1}$ such that $T^{-1}(T(\vec{x})) = \vec{x}$ |
| Determined by action on basis | Knowing $T(\vec{e}_i)$ is enough to define $T$ |

| Property | Holds if... |
|---|---|
| One-to-one | $\ker(A) = \{\mathbf{0}\}$ |
| Onto | Columns of $A$ span $\mathbb{R}^m$ |
| Invertible | $A$ is square and full-rank (no zero eigenvalues) |

Application in ML and Dimensionality Reduction¶

| Use Case | Description |
|---|---|
| PCA | Projects data onto directions of max variance |
| Feature transformation | Linear mappings in neural networks |
| Projections | Reduce dimensions while preserving structure |
| Affine transformations | Linear map + translation, used in computer vision |

Affine vs. Linear¶

  • Linear: $T(\vec{x}) = A\vec{x}$
  • Affine: $T(\vec{x}) = A\vec{x} + \vec{b}$ (not strictly linear because it doesn't preserve the origin)

Inner Product Spaces¶

What Is an Inner Product Space?¶

Inner Product (Dot Product in $\mathbb{R}^n$)¶

For real vectors $\vec{u}, \vec{v} \in \mathbb{R}^n$:

$$ \langle \vec{u}, \vec{v} \rangle = \sum_{i=1}^{n} u_i v_i $$


An inner product space is a vector space $V$ equipped with an inner product:

$$ \langle \vec{u}, \vec{v} \rangle $$

that returns a scalar and satisfies specific properties.


Inner Product Axioms¶

An inner product must satisfy, for all $\vec{u}, \vec{v}, \vec{w} \in V$ and scalars $a$:

  • Symmetry: $\langle \vec{u}, \vec{v} \rangle = \langle \vec{v}, \vec{u} \rangle$ (conjugate symmetry in complex spaces)
  • Linearity in the first argument: $\langle a\vec{u} + \vec{w}, \vec{v} \rangle = a\langle \vec{u}, \vec{v} \rangle + \langle \vec{w}, \vec{v} \rangle$
  • Positive-definiteness: $\langle \vec{v}, \vec{v} \rangle \geq 0$, with equality only when $\vec{v} = \vec{0}$

Norm from Inner Product¶

The norm (length) of a vector:

$$ \|\vec{v}\| = \sqrt{\langle \vec{v}, \vec{v} \rangle} $$

Orthogonality in Inner Product Spaces¶

Vectors $\vec{u}$, $\vec{v}$ are orthogonal if:

$$ \langle \vec{u}, \vec{v} \rangle = 0 $$

Angle Between Vectors¶

$$ \cos(\theta) = \frac{\langle \vec{u}, \vec{v} \rangle}{\|\vec{u}\| \cdot \|\vec{v}\|} $$

Defines geometric notions of angle in abstract vector spaces.


Projection of a Vector¶

Projection of $\vec{u}$ onto $\vec{v}$:

$$ \text{proj}_{\vec{v}} \vec{u} = \frac{\langle \vec{u}, \vec{v} \rangle}{\langle \vec{v}, \vec{v} \rangle} \vec{v} $$
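
A small sketch that evaluates the inner product, the angle, and the projection formula above for two example vectors (chosen arbitrarily):

In [ ]:
import numpy as np

u = np.array([1.0, 2.0, 0.0])
v = np.array([2.0, 0.0, 1.0])

inner = np.dot(u, v)                                          # <u, v>
cos_theta = inner / (np.linalg.norm(u) * np.linalg.norm(v))   # cosine similarity
proj_u_on_v = (inner / np.dot(v, v)) * v                      # projection of u onto v

print("Inner product:", inner)
print("cos(theta):", cos_theta)
print("proj_v(u):", proj_u_on_v)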

Inner Product Examples¶

| Space | Inner Product Formula |
|---|---|
| $\mathbb{R}^n$ | $\sum u_i v_i$ |
| Complex vector space | $\sum u_i \overline{v_i}$ |
| Function space (e.g., $L^2$) | $\langle f, g \rangle = \int_a^b f(x)g(x)\, dx$ |

Applications in ML & Data Science¶

| Application | Role of Inner Product |
|---|---|
| PCA / SVD | Finding directions with high variance (via orthogonality) |
| Kernel methods (SVM) | Generalized inner products via the kernel trick |
| Cosine similarity | Uses the normalized inner product for text/image comparison |
| Orthogonalization | Gram-Schmidt in inner product spaces |

Orthogonality and Orthonormality¶

Definition¶

  • Two vectors $\vec{a}$ and $\vec{b}$ are orthogonal if:
$$ \vec{a} \cdot \vec{b} = 0 $$

This means they are perpendicular in Euclidean space.


Dot Product (Inner Product)¶

$$ \vec{a} \cdot \vec{b} = \|\vec{a}\| \|\vec{b}\| \cos(\theta) $$
  • $\vec{a} \cdot \vec{b} = 0 \Rightarrow \theta = 90^\circ$, i.e., the vectors are orthogonal
  • For vectors in $\mathbb{R}^n$:

    $$ \vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i $$


Orthonormal Vectors¶

  • Vectors are orthonormal if:

    • They are orthogonal
    • Each has unit length $\|\vec{v}\| = 1$
  • Common in:

    • PCA (eigenvectors of covariance matrix)
    • SVD (columns of $U$ and $V$ are orthonormal)
    • QR decomposition
  • Why it matters?

| Application | Use of Orthogonality & Orthonormality |
|---|---|
| PCA | Principal components are orthogonal |
| Fourier transform | Basis functions are orthonormal |
| Gram-Schmidt | Converts a basis into an orthonormal basis |
| Neural networks | Weight initialization (orthogonal init) |
| QR decomposition | $A = QR$, where $Q$ is an orthogonal matrix |
| SVD | $U$ and $V$ have orthonormal columns |

Orthogonal Matrix¶

  • A matrix $Q \in \mathbb{R}^{n \times n}$ is orthogonal if:
$$ Q^T Q = QQ^T = I $$
  • Properties:

    • Columns (and rows) are orthonormal
    • $Q^{-1} = Q^T$
    • Preserves vector norms and angles
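
These properties are easy to verify with the Q factor from a QR decomposition (a sketch using a random matrix for illustration):

In [ ]:
import numpy as np

rng = np.random.default_rng(1)
A = rng.normal(size=(3, 3))

Q, R = np.linalg.qr(A)                     # Q is an orthogonal matrix
print("Q^T Q ≈ I:", np.allclose(Q.T @ Q, np.eye(3)))
print("Q^-1 == Q^T:", np.allclose(np.linalg.inv(Q), Q.T))

x = rng.normal(size=3)
print("Norm preserved:", np.isclose(np.linalg.norm(Q @ x), np.linalg.norm(x)))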

Projection onto a Vector¶

To project vector $\vec{a}$ onto vector $\vec{b}$:

$$ \text{proj}_{\vec{b}} \vec{a} = \frac{\vec{a} \cdot \vec{b}}{\|\vec{b}\|^2} \vec{b} $$
  • The error vector $\vec{a} - \text{proj}_{\vec{b}} \vec{a}$ is orthogonal to $\vec{b}$

Orthogonal Complement¶

  • The set of all vectors orthogonal to a subspace $W \subset \mathbb{R}^n$
  • Denoted $W^\perp$
$$ W^\perp = \{ \vec{v} \in \mathbb{R}^n : \vec{v} \cdot \vec{w} = 0 \ \forall \vec{w} \in W \} $$

Gram-Schmidt Process (Orthogonalization Technique)¶

  • Converts a set of linearly independent vectors into an orthonormal basis

Given a set of linearly independent vectors $\{ \mathbf{a}_1, \mathbf{a}_2, ..., \mathbf{a}_n \}$, we want to compute an orthonormal basis $\{ \mathbf{q}_1, \mathbf{q}_2, ..., \mathbf{q}_n \}$.

Steps: Projection → Subtraction → Normalization

  1. Start with $\mathbf{u}_1 = \mathbf{a}_1$

  2. For $k = 2$ to $n$:

    $$ \mathbf{u}_k = \mathbf{a}_k - \sum_{j=1}^{k-1} \text{proj}_{\mathbf{u}_j}(\mathbf{a}_k) $$

    Where projection:

    $$ \text{proj}_{\mathbf{u}_j}(\mathbf{a}_k) = \frac{\mathbf{a}_k \cdot \mathbf{u}_j}{\mathbf{u}_j \cdot \mathbf{u}_j} \mathbf{u}_j $$

  3. Normalize:

    $$ \mathbf{q}_k = \frac{\mathbf{u}_k}{\|\mathbf{u}_k\|} $$


Example

Let’s take 2 vectors:

$$ \mathbf{a}_1 = \begin{bmatrix} 3 \\ 1 \end{bmatrix}, \quad \mathbf{a}_2 = \begin{bmatrix} 2 \\ 2 \end{bmatrix} $$


Step 1:

$$ \mathbf{u}_1 = \mathbf{a}_1 = \begin{bmatrix} 3 \\ 1 \end{bmatrix} $$

Step 2:

$$ \text{proj}_{\mathbf{u}_1}(\mathbf{a}_2) = \frac{\mathbf{a}_2 \cdot \mathbf{u}_1}{\mathbf{u}_1 \cdot \mathbf{u}_1} \mathbf{u}_1 = \frac{(2)(3) + (2)(1)}{(3)^2 + (1)^2} \mathbf{u}_1 = \frac{8}{10} \mathbf{u}_1 = 0.8 \cdot \begin{bmatrix} 3 \\ 1 \end{bmatrix} = \begin{bmatrix} 2.4 \\ 0.8 \end{bmatrix} $$

$$ \mathbf{u}_2 = \mathbf{a}_2 - \text{proj} = \begin{bmatrix} 2 \\ 2 \end{bmatrix} - \begin{bmatrix} 2.4 \\ 0.8 \end{bmatrix} = \begin{bmatrix} -0.4 \\ 1.2 \end{bmatrix} $$

Step 3: Normalize

$$ \mathbf{q}_1 = \frac{\mathbf{u}_1}{\|\mathbf{u}_1\|} = \frac{1}{\sqrt{10}} \begin{bmatrix} 3 \\ 1 \end{bmatrix}, \quad \mathbf{q}_2 = \frac{\mathbf{u}_2}{\|\mathbf{u}_2\|} = \frac{1}{\sqrt{1.6}} \begin{bmatrix} -0.4 \\ 1.2 \end{bmatrix} $$

Now $\mathbf{q}_1$ and $\mathbf{q}_2$ are orthonormal!

In [ ]:
import numpy as np

def gram_schmidt(vectors):
    """Return an orthonormal basis for the span of `vectors`."""
    orthonormal_set = []
    for v in vectors:
        w = v.astype(float)
        # Subtract the projection of w onto every basis vector found so far
        for u in orthonormal_set:
            proj = np.dot(w, u) * u
            w = w - proj
        norm = np.linalg.norm(w)
        if norm == 0:  # v is linearly dependent on the previous vectors; skip it
            continue
        orthonormal_set.append(w / norm)  # normalize to unit length
    return np.array(orthonormal_set)

# Example
a1 = np.array([3, 1])
a2 = np.array([2, 2])
vectors = [a1, a2]

Q = gram_schmidt(vectors)
print("Orthonormal basis:")
print(Q)

Covariance and Correlation Matrices¶

Covariance Matrix¶

The covariance matrix is a square matrix that provides the covariance between pairs of variables in a dataset.

Definition:¶

For a dataset with $n$ variables, the covariance matrix $\Sigma$ is defined as:

$$ \Sigma = \text{cov}(\vec{X}) = \frac{1}{N-1} \sum_{i=1}^{N} (\vec{x}_i - \bar{\vec{x}})(\vec{x}_i - \bar{\vec{x}})^T $$

Where:

  • $\vec{X} = \left[ \vec{x}_1, \vec{x}_2, \dots, \vec{x}_N \right]$ is the matrix of data points, each $\vec{x}_i$ is a vector of variables.
  • $\bar{\vec{x}}$ is the mean vector of the data points.
  • $N$ is the number of data points.

Covariance Between Two Variables¶

The covariance between two variables $X$ and $Y$ is:

$$ \text{cov}(X, Y) = \frac{1}{N-1} \sum_{i=1}^{N} (x_i - \bar{x})(y_i - \bar{y}) $$
  • Positive covariance: As one variable increases, the other tends to increase.
  • Negative covariance: As one variable increases, the other tends to decrease.
  • Zero covariance: No linear relationship between variables.

Properties of Covariance Matrix¶

| Property | Description |
|---|---|
| Symmetry | The covariance matrix is symmetric: $\Sigma = \Sigma^T$. |
| Diagonal elements | Represent the variance of each variable. |
| Off-diagonal elements | Represent the covariance between pairs of variables. |
| Positive semi-definiteness | The covariance matrix is always positive semi-definite. |
| Units | Covariance is in the units of the product of the two variables. |

Correlation Matrix¶

The correlation matrix is a normalized version of the covariance matrix, where each element is divided by the product of the standard deviations of the corresponding variables.

Formula:¶

$$ \rho_{X,Y} = \frac{\text{cov}(X,Y)}{\sigma_X \sigma_Y} $$

Where:

  • $\rho_{X,Y}$ is the correlation between variables $X$ and $Y$.
  • $\sigma_X$ and $\sigma_Y$ are the standard deviations of $X$ and $Y$, respectively.

The correlation matrix $R$ is derived by normalizing the covariance matrix:

$$ R = \text{corr}(\vec{X}) = D^{-1} \Sigma D^{-1} $$

Where:

  • $D$ is a diagonal matrix containing the standard deviations of each variable.

Properties of Correlation Matrix¶

| Property | Description |
|---|---|
| Symmetry | The correlation matrix is symmetric: $R = R^T$. |
| Range | Correlation coefficients range between -1 and 1: $\rho = 1$ means perfect positive correlation, $\rho = -1$ means perfect negative correlation, $\rho = 0$ means no linear correlation. |
| Diagonal elements | Always 1, since the correlation of a variable with itself is 1. |
| Interpretation | Positive correlation: as one variable increases, the other tends to increase. Negative correlation: as one variable increases, the other tends to decrease. |
| No units | Correlation is a dimensionless quantity (a scaled version of covariance). |

Key Differences Between Covariance and Correlation Matrices¶

| Aspect | Covariance Matrix | Correlation Matrix |
|---|---|---|
| Scaling | Depends on the units of the variables | Unit-less (scaled) |
| Range of values | Any value between $-\infty$ and $\infty$ | Values range from -1 to 1 |
| Interpretation | Direct measure of joint variability | Normalized measure of linear relationship |
| Use | Understanding the variance-covariance structure | Comparing the strength of linear relationships across different pairs |

Example Calculation¶

Given a matrix of data with 3 variables and 4 samples:

$$ X = \begin{bmatrix} 2 & 4 & 3 \\ 4 & 5 & 6 \\ 3 & 7 & 8 \\ 6 & 8 & 9 \end{bmatrix} $$
  1. Step 1: Compute the mean of each column:

    • $\bar{X_1} = 3.75$
    • $\bar{X_2} = 6$
    • $\bar{X_3} = 6.5$
  2. Step 2: Compute the covariance between each pair of variables.

  3. Step 3: Compute the correlation matrix by dividing each covariance by the product of the corresponding standard deviations.


Applications¶

| Application | Description |
|---|---|
| PCA (Principal Component Analysis) | Uses the covariance matrix to find directions of maximum variance |
| Portfolio optimization | Correlation between assets is used to diversify risk |
| Multivariate analysis | Analyzing relationships and dependencies between multiple variables |
| Regression analysis | Correlation and covariance used to evaluate predictor variables |
| Machine learning | Feature selection and dimensionality reduction based on correlation |

In [ ]:
import numpy as np

# Example data matrix (3 variables, 4 observations)
X = np.array([[2, 4, 3],
              [4, 5, 6],
              [3, 7, 8],
              [6, 8, 9]])

# Compute Covariance Matrix
cov_matrix = np.cov(X, rowvar=False)

# Compute Correlation Matrix
corr_matrix = np.corrcoef(X, rowvar=False)

print("Covariance Matrix:\n", cov_matrix)
print("Correlation Matrix:\n", corr_matrix)
Covariance Matrix:
 [[2.91666667 2.33333333 3.5       ]
 [2.33333333 3.33333333 4.66666667]
 [3.5        4.66666667 7.        ]]
Correlation Matrix:
 [[1.         0.74833148 0.77459667]
 [0.74833148 1.         0.96609178]
 [0.77459667 0.96609178 1.        ]]

Matrix Factorization¶

What Is Matrix Factorization?¶

Matrix factorization is the process of decomposing a matrix $A \in \mathbb{R}^{m \times n}$ into two (or more) matrices whose product approximates the original matrix.

$$ A \approx B \cdot C $$

Where:

  • $A$ is the matrix to be approximated (e.g., user-item ratings matrix).
  • $B$ and $C$ are factorized matrices (often lower-dimensional).

Common Matrix Factorization Techniques¶

  1. Singular Value Decomposition (SVD)

    • Decomposes a matrix $A \in \mathbb{R}^{m \times n}$ into three matrices:

      $$ A = U \Sigma V^T $$

      • $U$: Left singular vectors (orthogonal)
      • $\Sigma$: Diagonal matrix of singular values
      • $V^T$: Right singular vectors (orthogonal)
    • Used in PCA and Latent Semantic Analysis (LSA).
  2. Non-negative Matrix Factorization (NMF)

    • Decomposes $A$ into two matrices $W \in \mathbb{R}^{m \times k}$ and $H \in \mathbb{R}^{k \times n}$, where all elements are non-negative:

      $$ A \approx W H $$

    • Useful for text mining and image processing (e.g., topic modeling).
  3. LU Decomposition

    • Factorizes a square matrix $A$ into:

      $$ A = L U $$

      • $L$: Lower triangular matrix
      • $U$: Upper triangular matrix
    • Common in solving linear systems of equations.
  4. QR Decomposition

    • Decomposes $A$ into:

      $$ A = Q R $$

      • $Q$: Orthogonal matrix (columns are orthonormal)
      • $R$: Upper triangular matrix
    • Used in solving least-squares problems.
  5. Cholesky Decomposition

    • For a positive-definite matrix $A$, it is decomposed into:

      $$ A = L L^T $$

      • $L$: Lower triangular matrix

Applications of Matrix Factorization¶

| Use Case | Technique | Description |
|---|---|---|
| Recommendation systems | SVD, NMF | Factorizes the user-item interaction matrix to predict ratings |
| Dimensionality reduction | SVD, PCA | Reduce dimensionality while preserving variance |
| Topic modeling | NMF | Factorizes text data into topics, each represented as a combination of words |
| Image compression | SVD, NMF | Decomposes the image matrix to reduce storage while preserving features |
| Signal processing | NMF, SVD | Decomposes signals into components for analysis or denoising |
| Data compression | SVD, NMF | Reduces data size while retaining the most important features |

Key Properties of Matrix Factorization¶

| Property | Description |
|---|---|
| Low-rank approximation | Factorization approximates the matrix using fewer components |
| Sparsity | NMF typically enforces sparsity (non-negative elements) |
| Uniqueness | In general, matrix factorization does not yield a unique solution without constraints |
| Computational complexity | Techniques like SVD are computationally expensive for large matrices |
| Interpretability | Factorized matrices (especially NMF) are often easier to interpret (e.g., topics, latent features) |

How to Perform Matrix Factorization¶

For SVD:¶

  1. Compute $A = U \Sigma V^T$
  2. Dimensionality reduction: Use the top $k$ singular values and vectors to approximate $A$.

    $$ A_k \approx U_k \Sigma_k V_k^T $$

For NMF:¶

  • Objective: Find matrices $W$ and $H$ that minimize the Frobenius norm:

    $$ \| A - WH \|_F $$

    (i.e., the difference between the original matrix and the approximation).

  • Use gradient descent or alternating least squares (ALS) methods for optimization.
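
A minimal NMF sketch, assuming scikit-learn is available (the non-negative data matrix below is random and purely illustrative):

In [ ]:
import numpy as np
from sklearn.decomposition import NMF    # assumes scikit-learn is installed

rng = np.random.default_rng(0)
A = rng.random((6, 5))                   # non-negative data matrix

model = NMF(n_components=2, init='random', random_state=0, max_iter=500)
W = model.fit_transform(A)               # 6 x 2 factor
H = model.components_                    # 2 x 5 factor

print("Frobenius reconstruction error:", np.linalg.norm(A - W @ H))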


🔹 Choosing the Right Factorization Method¶

| Method | Best Use Case | Key Advantage | Limitation |
|---|---|---|---|
| SVD | Dimensionality reduction, PCA | Provides an exact decomposition | Computationally expensive for large matrices |
| NMF | Text mining, topic modeling | Interpretability (non-negative factors) | Can only handle non-negative data |
| LU | Solving systems of linear equations | Fast for square matrices | Requires square matrices |
| QR | Solving least-squares problems | Numerically stable | Not ideal for large-scale systems |

Example (SVD in Python)¶

In [ ]:
import numpy as np

# Example matrix A
A = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# Perform Singular Value Decomposition (SVD)
U, Sigma, Vt = np.linalg.svd(A)

# Reconstruct the matrix
A_reconstructed = np.dot(U, np.dot(np.diag(Sigma), Vt))

print("Original Matrix:\n", A)
print("Reconstructed Matrix:\n", A_reconstructed)

Advanced Calculus¶

Functions & Graphs¶


1️⃣ Functions Basics¶

  • A function $f: A \to B$ maps elements from set $A$ to set $B$.
  • Examples: $f(x) = x^2$, $g(x) = \sin x$, etc.
  • Key properties: domain, range, injective, surjective, continuous.

2️⃣ Trigonometric Functions¶

Important identities:

  • Pythagorean:

    $$ \sin^2 x + \cos^2 x = 1 $$

  • Angle sum/difference:

    $$ \sin(a \pm b) = \sin a \cos b \pm \cos a \sin b $$

    $$ \cos(a \pm b) = \cos a \cos b \mp \sin a \sin b $$

  • Double angle:

    $$ \sin 2x = 2 \sin x \cos x, \quad \cos 2x = \cos^2 x - \sin^2 x $$


3️⃣ Solving Trigonometric Equations¶

General steps:

  • Use trig identities to simplify.
  • Express equation in terms of one trig function.
  • Solve for variable $x$ in the domain.

Example 1: Solve for $x$ in $\sin x = \frac{1}{2}$ on $[0, 2\pi]$

  • $x = \frac{\pi}{6}$, $\frac{5\pi}{6}$

Example 2: Solve $2 \cos^2 x - 3 \sin x = 0$

  • Convert using $\cos^2 x = 1 - \sin^2 x$: $2 - 2\sin^2 x - 3\sin x = 0$
  • Solve the resulting quadratic in $\sin x$: $2\sin^2 x + 3\sin x - 2 = 0 \Rightarrow (2\sin x - 1)(\sin x + 2) = 0$
  • $\sin x = \frac{1}{2}$ (the root $\sin x = -2$ is impossible), so $x = \frac{\pi}{6}, \frac{5\pi}{6}$ on $[0, 2\pi]$

4️⃣ Exponential Functions¶

  • Form:

    $$ f(x) = a^x, \quad a > 0, a \neq 1 $$

  • Properties:

    • $a^{x+y} = a^x \cdot a^y$
    • $(a^x)^y = a^{xy}$
  • Natural exponential:

    $$ e^x = \lim_{n \to \infty} \left(1 + \frac{x}{n}\right)^n, \quad e \approx 2.718 $$


5️⃣ Logarithmic Functions¶

  • Inverse of exponential:

    $$ y = \log_a x \iff a^y = x $$

  • Properties:

    • $\log_a (xy) = \log_a x + \log_a y$
    • $\log_a \left(\frac{x}{y}\right) = \log_a x - \log_a y$
    • $\log_a (x^k) = k \log_a x$
    • Change of base formula:

      $$ \log_a x = \frac{\log_b x}{\log_b a} $$

  • Common logarithms:

    • Natural log $\ln x = \log_e x$
    • Base-10 log $\log x$

6️⃣ Transformations of Functions¶

Starting from a base function $f(x)$, the standard transformations are:

  • Vertical shift: $f(x) + c$
  • Horizontal shift: $f(x - c)$
  • Vertical stretch/compression: $a\,f(x)$
  • Horizontal stretch/compression: $f(bx)$
  • Reflections: $-f(x)$ (about the x-axis), $f(-x)$ (about the y-axis)

Multivariable Calculus¶

Limits and continuity¶


Derivatives¶


Partial Derivatives¶

Extrema¶


Integrals¶


Vector Calculus¶



Vector Calculus Basics¶


1. Gradient (∇f)¶

For scalar function $f(x, y, z)$:

$$ \nabla f = \left( \frac{\partial f}{\partial x}, \frac{\partial f}{\partial y}, \frac{\partial f}{\partial z} \right) $$
  • Direction of maximum increase
  • Magnitude = rate of maximum increase

2. Divergence (∇·F)¶

For vector field $\vec{F} = (F_1, F_2, F_3)$:

$$ \nabla \cdot \vec{F} = \frac{\partial F_1}{\partial x} + \frac{\partial F_2}{\partial y} + \frac{\partial F_3}{\partial z} $$
  • Measures outflow of a vector field (source/sink behavior)

3. Curl (∇×F)¶

$$ \nabla \times \vec{F} = \left( \frac{\partial F_3}{\partial y} - \frac{\partial F_2}{\partial z}, \frac{\partial F_1}{\partial z} - \frac{\partial F_3}{\partial x}, \frac{\partial F_2}{\partial x} - \frac{\partial F_1}{\partial y} \right) $$
  • Measures rotation or circulation of a field

4. Laplacian (∆f or ∇²f)¶

For scalar function $f(x, y, z)$:

$$ \nabla^2 f = \frac{\partial^2 f}{\partial x^2} + \frac{\partial^2 f}{\partial y^2} + \frac{\partial^2 f}{\partial z^2} $$
  • Used in PDEs, heat/diffusion equations
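
As a rough numerical sketch, np.gradient approximates partial derivatives on a grid, which gives the gradient and (by differentiating twice) the Laplacian of a sampled scalar field; the function $f(x, y) = x^2 + y^2$ is an arbitrary test case whose Laplacian is exactly 4.

In [ ]:
import numpy as np

# Sample f(x, y) = x^2 + y^2 on a grid
x = np.linspace(-2, 2, 101)
y = np.linspace(-2, 2, 101)
X, Y = np.meshgrid(x, y, indexing='ij')
F = X**2 + Y**2

dF_dx, dF_dy = np.gradient(F, x, y)          # numerical gradient components
d2F_dx2 = np.gradient(dF_dx, x, axis=0)
d2F_dy2 = np.gradient(dF_dy, y, axis=1)
laplacian = d2F_dx2 + d2F_dy2

print("Gradient at the origin ≈", dF_dx[50, 50], dF_dy[50, 50])   # ≈ (0, 0)
print("Laplacian at the origin ≈", laplacian[50, 50])             # ≈ 4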

5. Line Integral¶

For scalar field $f$ over curve $C$:

$$ \int_C f \, ds $$

For vector field $\vec{F}$ over path $C$:

$$ \int_C \vec{F} \cdot d\vec{r} $$

6. Surface Integral¶

For scalar field:

$$ \iint_S f(x, y, z) \, dS $$

For vector field $\vec{F}$:

$$ \iint_S \vec{F} \cdot \vec{n} \, dS $$

7. Theorems (Vector Identities)¶

Gradient of a Constant¶

$$ \nabla c = 0 $$

Divergence of a Curl¶

$$ \nabla \cdot (\nabla \times \vec{F}) = 0 $$

Curl of a Gradient¶

$$ \nabla \times (\nabla f) = 0 $$

8. Important Theorems¶

Green’s Theorem (2D)¶

$$ \oint_C (P dx + Q dy) = \iint_R \left( \frac{\partial Q}{\partial x} - \frac{\partial P}{\partial y} \right) dxdy $$

Stokes’ Theorem (3D Curl)¶

$$ \oint_C \vec{F} \cdot d\vec{r} = \iint_S (\nabla \times \vec{F}) \cdot d\vec{S} $$

Divergence Theorem (Gauss)¶

$$ \iiint_V (\nabla \cdot \vec{F}) \, dV = \iint_S \vec{F} \cdot d\vec{S} $$

Jacobian and Hessian Matrices¶

1. Jacobian Matrix (∂f/∂x)¶

Used when transforming multivariate functions.

Let $\vec{f}(\vec{x}) = [f_1(\vec{x}), f_2(\vec{x}), ..., f_m(\vec{x})]^\top$, where $\vec{x} = [x_1, x_2, ..., x_n]^\top$

Then the Jacobian $J \in \mathbb{R}^{m \times n}$ is:

$$ J = \frac{\partial \vec{f}}{\partial \vec{x}} = \begin{bmatrix} \frac{\partial f_1}{\partial x_1} & \cdots & \frac{\partial f_1}{\partial x_n} \\ \vdots & \ddots & \vdots \\ \frac{\partial f_m}{\partial x_1} & \cdots & \frac{\partial f_m}{\partial x_n} \end{bmatrix} $$

✅ Used in:

  • Chain rule (deep learning)
  • Nonlinear transformations
  • Backpropagation (DL)
  • Jacobian determinant in volume changes (normalizing flows)

2. Hessian Matrix (Second Derivatives)¶

Used for second-order optimization analysis (e.g., Newton’s method).

Let $f: \mathbb{R}^n \to \mathbb{R}$

Then the Hessian $H \in \mathbb{R}^{n \times n}$ is:

$$ H = \nabla^2 f = \begin{bmatrix} \frac{\partial^2 f}{\partial x_1^2} & \cdots & \frac{\partial^2 f}{\partial x_1 \partial x_n} \\ \vdots & \ddots & \vdots \\ \frac{\partial^2 f}{\partial x_n \partial x_1} & \cdots & \frac{\partial^2 f}{\partial x_n^2} \end{bmatrix} $$

✅ Used in:

  • Convexity check (positive semi-definite Hessian → convex function)
  • Newton's method for optimization
  • Taylor series expansion in multivariate calculus
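
The sketch below approximates both objects with central finite differences (not automatic differentiation); the test functions and the evaluation point are arbitrary examples.

In [ ]:
import numpy as np

def numerical_jacobian(f, x, eps=1e-6):
    """Central-difference approximation of the Jacobian of f at x."""
    x = np.asarray(x, dtype=float)
    m = np.atleast_1d(f(x)).size
    J = np.zeros((m, x.size))
    for j in range(x.size):
        e = np.zeros_like(x)
        e[j] = eps
        J[:, j] = (np.atleast_1d(f(x + e)) - np.atleast_1d(f(x - e))) / (2 * eps)
    return J

def numerical_hessian(f, x, eps=1e-5):
    """Hessian of a scalar f at x, as the Jacobian of its numerical gradient."""
    grad = lambda z: numerical_jacobian(lambda w: np.array([f(w)]), z, eps).ravel()
    return numerical_jacobian(grad, x, eps)

f_vec = lambda x: np.array([x[0] * x[1], np.sin(x[0])])   # R^2 -> R^2
f_scalar = lambda x: x[0]**2 + 3 * x[0] * x[1]            # R^2 -> R

x0 = np.array([1.0, 2.0])
print("Jacobian at x0:\n", numerical_jacobian(f_vec, x0))     # ≈ [[2, 1], [cos(1), 0]]
print("Hessian at x0:\n", numerical_hessian(f_scalar, x0))    # ≈ [[2, 3], [3, 0]]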

Coordinate Systems¶


Optimization Techniques¶

Optimization algorithms are at the heart of machine learning and deep learning. They are used to minimize (or maximize) a loss (or objective) function by iteratively updating the model's parameters.

Key Concepts¶

  • Objective Function: The objective (loss) function measures how well the model is performing; it quantifies the difference between the predicted output and the actual target.

  • Goal of Optimization: Minimize the loss function.

  • Variables (Parameters): The internal variables of the model that are learned during training (e.g., weights in a neural network).

  • Constraints: Conditions that the solution must satisfy.
  • Feasible Region: The subset of all potential solutions that are viable given the constraints in place.
  • Gradient: The vector of partial derivatives of the loss function with respect to each parameter.
  • Learning Rate (η): Controls the step size of parameter updates during optimization.
  • Convex Functions: Have a single global minimum. Optimization is easier because gradient descent will always find the global minimum.
  • Non-Convex Functions: Have multiple local minima. Optimization is harder because gradient descent might get stuck in a local minimum.
  • Local Minima: Points where the loss function is lower than in the immediate neighborhood but not the global minimum.
  • Saddle Points: Points where the gradient is zero but are neither a local minimum nor a maximum. These can slow down optimization.
  • Regularization techniques prevent overfitting by adding a penalty to the loss function.
  • Backpropagation is the process of computing gradients in neural networks using the chain rule of calculus.
  • Epoch: One full pass through the entire training dataset.
  • Iteration: One update of the model's parameters (e.g., processing one mini-batch).
  • Bias: Error due to overly simplistic assumptions in the model (underfitting).
  • Variance: Error due to the model's sensitivity to small fluctuations in the training set (overfitting).
  • Vanishing Gradients: Gradients become very small, slowing down learning (common in deep networks).
  • Exploding Gradients: Gradients become very large, causing unstable updates.
  • Hyperparameters are settings that control the optimization process and model behavior. Examples:
    1. Learning rate.
    2. Batch size.
    3. Number of epochs.
    4. Momentum term.
  • Convergence occurs when the optimization algorithm finds a set of parameters that minimize the loss function.
  • Early stopping is a regularization technique that stops training when the validation loss stops improving, preventing overfitting.


Gradient Descent & Its Variants¶

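A bare-bones gradient descent sketch on a least-squares loss (purely illustrative; the variants discussed here add momentum, mini-batches, adaptive learning rates, and so on):

In [ ]:
import numpy as np

# Minimize the mean squared error of a linear model with plain gradient descent
rng = np.random.default_rng(42)
X = rng.normal(size=(100, 2))
true_w = np.array([2.0, -3.0])
y = X @ true_w + 0.1 * rng.normal(size=100)

w = np.zeros(2)              # initial parameters
eta = 0.01                   # learning rate
for epoch in range(500):     # one "epoch" = one full pass over the data here
    grad = 2 * X.T @ (X @ w - y) / len(y)   # gradient of the MSE loss
    w -= eta * grad                          # parameter update

print("Learned parameters:", w)              # should be close to [2, -3]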

Convex Functions and Optimizations¶