Avoidable Pitfalls in NumPy for Data Analysis

Key Array Attributes Without Parentheses

# Plain attributes, not methods: no call parentheses (arr is presumably an ndarray).
arr.dtype   # element data type
arr.shape   # yields a tuple
arr.size    # total number of elements
arr.ndim    # number of dimensions

Reshaping vs Resizing: arr.reshape, arr.resize, np.resize

  • arr.reshape(dim1, dim2, ...) returns a new array without modifying the original, whereas arr.resize((dim1, dim2, ...)) alters the array in-place and returns nothing. np.resize(arr, (dim1, dim2, ...)) returns a fresh array while leaving the original intact. The parameter format differs: reshape takes multiple integer arguments, while resize expects a single tuple.
  • reshape enforces matching total element counts; a mismatch raises an error (ValueError). resize can expand or shrink the array—when enlarging, np.resize pads with repeated copies of the original data while arr.resize pads with zeros; when shrinking, both discard the surplus data.
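  • In reshape, you can supply -1 for one dimension to let it infer the correct size from arr.size. resize does not infer dimensions this way: in older NumPy versions a -1 passed to np.resize behaved like an inferred dimension only by accident (using a truncated copy of the data, otherwise collapsing to 1), and recent NumPy versions reject negative dimensions with a ValueError.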
# Contrast np.resize (returns a new array) with ndarray.resize (modifies in place).
a = np.array([[1, 2, 3], [4, 5, 6]])
print('initial array:', a, sep='\n')
print('initial shape:', a.shape, sep='\n')

# np.resize hands back a new (3, 2) array and leaves `a` as it was.
b = np.resize(a, (3, 2))
print('second array:', b, sep='\n')
print('first remains unchanged:', a, sep='\n')
print('second shape:', b.shape, sep='\n')

# ndarray.resize changes `a` itself and returns None.
a.resize((3, 2))
print('in-place modification:', a, sep='\n')

# A (3, 3) result needs 9 values, so np.resize repeats the data from the start.
print('modified second array size:')
b = np.resize(a, (3, 3))
print(b)

arr = np.arange(12)
arr1 = arr.reshape(-1, 2, 2)   # -1 is inferred from arr.size: 12 / (2 * 2) = 3
print(len(arr1))               # 3
# np.resize does not infer a dimension from -1: recent NumPy releases raise
# ValueError for any negative entry in new_shape, so every dimension is
# written out explicitly here.
arr2 = np.resize(arr, (2, 2, 2))   # shrinks: keeps the first 8 values
print(arr2)
arr3 = np.resize(arr, (2, 1, 3))   # shrinks: keeps the first 6 values
print(arr3)

Min/Max with np.amin, np.amax, np.argmin, np.argmax

# Minimum / maximum reductions on a 3x3 array, per axis and over all elements.
a = np.array([[3, 7, 5], [8, 4, 3], [2, 4, 9]])
print('original:', a, sep='\n')
# axis=1 reduces across each row; np.amin gives the same result as np.min.
print('row minimums:', np.amin(a, axis=1), sep='\n')
# axis=0 reduces down each column.
print('column minimums:', np.amin(a, axis=0), sep='\n')
# With no axis the whole array is reduced; np.amax gives the same result as np.max.
print('global max:', np.amax(a), sep='\n')
print('max along columns:', np.amax(a, 0), sep='\n')

Conditional Indexing: np.argwhere

# Indices of the elements of scoreAll that compare equal to name, one index
# row per match. scoreAll and name are not defined in this snippet —
# presumably an array and a value to look up; confirm against the caller.
indexName = np.argwhere(scoreAll == name)

Index Sorting: np.argsort

x = np.array([3, 1, 2])
print('original:', x, sep='\n')
# argsort returns the positions that would arrange x in ascending order.
y = np.argsort(x)
print('argsort result:', y, sep='\n')   # [1 2 0]

Sorting Values: np.sort

matrix = np.array([[3, 7], [9, 1]])
print('original:', matrix, sep='\n')
# With no axis given, np.sort works along the last axis (each row here).
print('default sort (along last axis):', np.sort(matrix), sep='\n')
# axis=0 sorts the values within each column.
print('sort along columns (axis=0):', np.sort(matrix, axis=0), sep='\n')

# Structured dtype: a 10-byte string field and an integer field.
_entries = [('raju', 21), ('anil', 25), ('ravi', 17), ('amar', 27)]
dt = np.dtype([('name', 'S10'), ('age', int)])
people = np.array(_entries, dtype=dt)
print(people)
print(np.sort(people))               # no order given: compares fields in declaration order
print(np.sort(people, order='age'))  # order= picks the field to sort by

# The same records without a dtype: NumPy stores every entry as a string.
raw = np.array(_entries)
print(raw)
print(np.sort(raw))

Cumulative Sum: np.cumsum

arr = np.arange(6).reshape((2, 3))
print(arr)
print(arr.cumsum())        # no axis: running total over the flattened array
print(arr.cumsum(axis=0))  # axis=0: running total down each column (accumulates across rows)
print(arr.cumsum(axis=1))  # axis=1: running total along each row (accumulates across columns)

Appending Data: np.append

base = np.array([[1, 2, 3], [4, 5, 6]])
print(base)
# No axis: both inputs are flattened before the values are appended.
print('flat append:', np.append(base, [7, 8, 9]), sep='\n')
# axis=0 adds a new row; the appended values must have matching row length.
print('append along axis 0:', np.append(base, [[7, 8, 9]], axis=0), sep='\n')
# axis=1 extends every row; one list of new values per existing row.
print('append along axis 1:', np.append(base, [[5, 5, 5], [6, 6, 6]], axis=1), sep='\n')

Inserting Elements: np.insert

board = np.array([[1, 2], [3, 4], [5, 6]])
print('initial:', board, sep='\n')
# No axis: board is flattened and the values go in before flat index 3.
print('without axis (flattened insert):', np.insert(board, 3, [11, 12]), sep='\n')
# axis=0: the values become a new row placed before row 1.
print('insert row at position 1:', np.insert(board, 1, [11, 12], axis=0), sep='\n')
# axis=1 with a scalar: the scalar is broadcast to fill a whole new column.
print('insert column with scalar broadcast:', np.insert(board, 1, 11, axis=1), sep='\n')

Deleting Elements: np.delete

base_grid = np.arange(12).reshape((3, 4))
print('grid:', base_grid, sep='\n')
# No axis: the grid is flattened and the element at flat index 5 is dropped.
print('without axis (flattened deletion):', np.delete(base_grid, 5), sep='\n')
# axis=1 with index 1 removes the second column.
print('delete second column:', np.delete(base_grid, 1, axis=1), sep='\n')
print('using slice object:')
sequence = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
# np.s_[::3] selects every third position (0, 3, 6, 9) for removal.
print(np.delete(sequence, np.s_[::3]))

Unique Sorted Values: np.unique

values = np.array([5, 2, 6, 2, 7, 5, 6, 8, 2, 9])
print('original values:')
print(values)
print('unique sorted values:')
unique_vals = np.unique(values)
print(unique_vals)
print('original indices of unique items:')
# return_index=True also gives the position of each unique value's first occurrence.
unique_vals, first_indices = np.unique(values, return_index=True)
print(first_indices)
print('inverse indices reconstruction:')
# return_inverse=True gives, for every element of values, its index into
# unique_vals; indexing unique_vals with it rebuilds the original array.
unique_vals, inverse_indices = np.unique(values, return_inverse=True)
print(inverse_indices)
reconstructed = unique_vals[inverse_indices]
print(reconstructed)

Conditional Extraction: np.extract

base_square = np.arange(100).reshape((10, 10))
# Boolean mask with the same shape as base_square: True where the value is odd.
odds = (base_square % 2) != 0
# np.extract returns the selected values as a flat 1-D array.
print(np.extract(odds, base_square))

Intersection: np.intersect1d

set_a, set_b = np.array([1, 1, 2, 5, 4]), np.array([2, 1, 4, 5])
# return_indices=True also yields, for each common value, the index of its
# first occurrence in set_a and in set_b.
common, idx_a, idx_b = np.intersect1d(set_a, set_b, return_indices=True)
# Expected output, in order: [1 2 4 5], [0 2 4 3], [1 0 2 3]
for _part in (common, idx_a, idx_b):
    print(_part)

Slicing with Boolean Masks

arr = np.arange(15).reshape((3, 5))
# A 1:2 slice keeps the column axis, so the result is a (3, 1) array.
col_slice = arr[..., 1:2]        # same as arr[:, 1:2]
print(col_slice)
# Rows from 1 onward, columns from 2 onward.
region = arr[1:, 2:]
print(region)
# Paired index lists select individual (row, col) coordinates.
_rows, _cols = [0, 0, 2, 2], [1, 3, 1, 3]
coord_sel = arr[_rows, _cols]
print(coord_sel)
# Element-wise masks combine with & and |; each comparison needs its own parentheses.
_in_range = (arr >= 5) & (arr <= 10)
mask_range = arr[_in_range]
print(mask_range)
mask_outliers = arr[~_in_range]  # De Morgan: (arr < 5) | (arr > 10)
print(mask_outliers)

# A boolean mask on an ndarray returns a flat 1-D result; pandas keeps the frame shape.
matrix = np.arange(1, 10).reshape(3, 3)
_above_four = matrix > 4
print(_above_four)
print(matrix[_above_four])

# np.where on a condition yields the indices where it holds.
series = np.arange(10)
print(series[np.where(series % 2 != 0)])  # [1 3 5 7 9]

Pseudo-Random Number Generation: np.random Methods

  • Uniform distributions
    • np.random.rand(d0, d1, ...) — uniform over [0,1); the dimensions are passed as separate integers, e.g. rand(2) or rand(2, 3), not as a tuple.
    • np.random.randint(low, high, size) — uniform integers in [low, high).
    • np.random.uniform(low, high, size) — uniform floats in [low, high).
  • Normal distributions
    • np.random.randn(d0, d1, ...) — standard normal (mean 0, variance 1); like rand, the dimensions are passed as separate integers rather than a tuple.
    • np.random.normal(loc, scale, size) — general normal with specified mean (loc) and standard deviation (scale).

Rounding: np.around

# `a` is not defined in this snippet; it is whatever array an earlier example left bound.
print(np.around(a, 1))    # positive decimals: round to one decimal place
print(np.around(a, -1))   # negative decimals: round to the nearest ten (-2 for hundred, and so on)

Loading CSV Data: loadtxt

# np.loadtxt splits on whitespace by default, so a comma-separated file needs
# an explicit delimiter. NOTE(review): assumes every column is numeric and
# there is no header row — confirm against the actual file.
records = np.loadtxt('iris_data.csv', delimiter=',')

Tags: Numpy Data Analysis python array manipulation Common Mistakes

Posted on Fri, 19 Jun 2026 18:22:00 +0000 by strago