Data Science Flash Cards for Python

0.0(0)

Studied by 4 people

Knowt Play

Learn

Practice Test

Spaced Repetition

Match

Flashcards

Card Sorting

1/110

Earn XP

Description and Tags

Generated by Claude

Study Analytics

Name	Mastery	Learn	Test	Matching	Spaced

No study sessions yet.

111 Terms

New cards

How do you create variables and check their types?

name = 'John'
age = 25

print(type(name), type(age))
>>> <class 'str'> <class 'int'>

New cards

How do you create and manipulate lists?

fruits=['apple','banana']
fruits.append('orange')
fruits.extend(['grape','kiwi'])
print(fruits[0]) # First element
print(fruits[-1]) # Last element
>>>apple
>>>kiwi

New cards

How do you create and access dictionary data?

student = {'name': 'Alice', 'age': 22}

print(student['name'])

student['grade'] = 'A'

print(student.get('city', 'Unknown'))

>>>Alice
>>>Unknown

New cards

How do you iterate through data with for loops?

# Iterate over a list
for item in ['a', 'b', 'c']:
  print(item)
>>>a
>>>b
>>>c

# Iterate with index
for i in range(5):
  print(f"Number: {i}")
>>>Number: 0
>>>Number: 1
>>>Number: 2
>>>Number: 3
>>>Number: 4

New cards

How do you create lists efficiently using list comprehensions?

# Basic list comprehension
squares = [x**2 for x in range(5)]
print(squares)
>>>[0, 1, 4, 9, 16]

# With condition
evens = [x for x in range(10) if x % 2 == 0]
print(evens)
>>>[0, 2, 4, 6, 8]

# Nested comprehension
matrix = [[i*j for j in range(3)] for i in range(3)]
print(matrix)
>>>[[0, 0, 0], [0, 1, 2], [0, 2, 4]]

New cards

How do you define and call functions?

def greet(name, greeting='Hello'):
  """Build a greeting string such as 'Hello, Alice!'.

  name: who is being greeted.
  greeting: the leading word; falls back to 'Hello' when omitted.
  """
  message = f'{greeting}, {name}!'
  return message

# Function calls
result = greet('Alice')
result2 = greet('Bob', 'Hi') # Specified a greeting other than the default 'Hello'

print(result)
>>>Hello, Alice!

print(result2)
>>>Hi, Bob!

New cards

How do you handle conditional statements?

x = 10
if x > 0: 
  print('positive')
elif x < 0: 
  print('negative')
else: 
  print('zero')

# Ternary operator
result = 'positive' if x > 0 else 'not positive'

>>>positive

New cards

How do you work with tuples?

# Create tuple
coords = (3, 5)
point = 1, 2, 3 # Parentheses optional

# Unpack tuple
x,y=coords
print(coords[0]) # Access by index
>>>3

# coords[0] = 10 # Error: tuples are immutable

New cards

How do you work with sets?

# Create set
unique_nums = {1, 2, 3, 3, 4} # Duplicates removed
my_set = set([1, 2, 3])

# Set operations
my_set.add(5)
print(my_set)
>>>{1, 2, 3, 5}

my_set.remove(1)
print(1 in my_set)
>>>False
print(my_set)
>>>{2, 3, 5}

New cards

How do you perform string operations?

text = 'Hello World'

print(text.lower())
>>>hello world

print(text.upper())
>>>HELLO WORLD

print(text.split())
>>>['Hello', 'World']

print(text.replace('World', 'Python'))
>>>Hello Python

print(text.startswith('Hello')) 
>>>True

New cards

How do you use while loops?

count = 0
while count < 3: 
  print(f'Count is: {count}')
  count += 1
>>> Count is: 0
>>> Count is: 1
>>> Count is: 2

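# With break and continue
n = 0
while True:
  n += 1
  if n == 2:
    continue   # Skip the rest of this iteration
  if n > 3:
    break      # Exit the loop
  print(n)
>>> 1
>>> 3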

New cards

How do you work with range()?

# Basic range

for i in range(5):    # 0, 1, 2, 3, 4
  print(i)

>>> 0
>>> 1
>>> 2
>>> 3
>>> 4


# Range with start, stop, step

for i in range(0, 10, 2):
  print(i)

>>> 0
>>> 2
>>> 4
>>> 6
>>> 8


# Convert to list

numbers = list(range(5))
print(numbers)

>>> [0, 1, 2, 3, 4]

New cards

How do you check if an item is in a list/dict?

fruits = ['apple', 'banana']
student = {'name': 'Alice', 'age': 22}

print('apple' in fruits)
>>> True

print('name' in student)
>>> True

print('grade' in student)
>>> False

New cards

How do you sort lists?

numbers = [3, 1, 4, 1, 5]

# In-place sorting
numbers.sort()
print(numbers)   
>>> [1, 1, 3, 4, 5]

# Return new sorted list
sorted_nums = sorted([3, 1, 4], reverse=True)
print(sorted_nums)
>>> [4, 3, 1]

New cards

How do you remove items from lists?

fruits = ['apple', 'banana', 'orange']

# Remove by value
fruits.remove('apple')
print(fruits)
>>> ['banana', 'orange']


# Remove by index
removed = fruits.pop(0)
print(fruits)
>>> ['orange']


# Delete by index
del fruits[0]
print(fruits)
>>> []

New cards

How do you get user input?

name = input('Enter your name: ')
age = int(input('Enter your age: '))
height = float(input('Enter height: '))

print(f'Hello {name}, you are {age} years old')

# Upon running, a text box will pop up for each option and user will enter values. For example, Name = Oliver, age = 89
>>> Hello Oliver, you are 89 years old

New cards

How do you convert between data types?

# String conversions to another data type
num_str = str(42)
>>> '42'
num_int = int('42')
>>> 42
num_float = float('3.14')
>>> 3.14

# Collection conversions
my_list = list('hello')
print(my_list)
>>> ['h', 'e', 'l', 'l', 'o']

my_tuple=tuple([1, 2, 3])
>>> (1, 2, 3)

New cards

How do you work with multiple assignment?

# Multiple assignment
a, b = 1, 2
x = y = z = 0

# Swapping variables
a, b = b, a

# Unpacking
coords = (3, 5)
x, y = coords

print(coords)
>>> (3, 5)

New cards

How do you use enumerate()?

fruits = ['apple', 'banana', 'cherry']

# Get index and value
for i, fruit in enumerate(fruits):
  print(f'{i}: {fruit}')
>>> 0: apple
>>> 1: banana
>>> 2: cherry

# Start counting from 1
for i, fruit in enumerate(fruits, 1):
  print(f'{i}: {fruit}')
>>> 1: apple
>>> 2: banana
>>> 3: cherry

New cards

How do you use zip()?

names = ['Alice', 'Bob', 'Charlie']
ages = [25, 30, 35]
cities = ['NYC', 'LA', 'Chicago']


# Combine multiple lists
for name, age, city in zip(names, ages, cities):
  print(f'{name}, {age}, {city}')
>>> Alice, 25, NYC
>>> Bob, 30, LA
>>> Charlie, 35, Chicago

New cards

How do you create NumPy arrays?

import numpy as np

# From list
arr = np.array([1, 2, 3])
print(arr)
>>> [1 2 3]

# Create special arrays
zeros = np.zeros((3, 4))   # 3x4 array of zeros
>>> [[0. 0. 0. 0.]
>>> [0. 0. 0. 0.]
>>> [0. 0. 0. 0.]]

ones = np.ones((2, 3))     # 2x3 array of ones
>>> [[1. 1. 1.]
>>> [1. 1. 1.]]

eye = np.eye(3)        # 3x3 identity matrix
>>> [[1. 0. 0.]
>>> [0. 1. 0.]
>>> [0. 0. 1.]]

New cards

How do you access NumPy array elements?

arr = np.array([[1,2,3], [4,5,6]])

# Basic indexing
print(arr[0])    # First row
print(arr[0,1])  # Element at row 0, col 1
print(arr[:, 1]) #  All rows, col 1
print(arr[1:3])  # Rows 1 to 2

New cards

How do you perform mathematical operations on arrays?

a = np.array([1,2,3])
b = np.array([4,5,6])

# Element-wise operations
print(a + b)      # [5, 7, 9]
print(a * b)      # [4, 10, 18]
print(np.sqrt(a)) # Square root
print(np.exp(a))  # Exponential

New cards

How do you reshape NumPy arrays?

arr = np.array([1,2,3,4,5,6])

# Reshape to 2D
reshaped = arr.reshape(2,3)   # 2 rows, 3 cols

# Flatten to 1D
flat = reshaped.flatten()

# Transpose
transposed = reshaped.T

New cards

How do you find array statistics?

arr = np.array([1,2,3,4,5])

print(np.mean(arr))   # Mean: 3.0
print(np.std(arr))    # Standard deviation
print(np.min(arr))    # Minimum: 1
print(np.max(arr))    # Maximum: 5
print(np.median(arr)) # Median: 3.0
print(np.sum(arr))    # Sum: 15

New cards

How do you create arrays with specific values?

# Arrays with specific values
zeros = np.zeros(5)             # [0,0,0,0,0]
ones = np.ones(3)               # [1,1,1]
full = np.full(4, 7)            # [7,7,7,7]
range_arr = np.arange(0, 10, 2) # [0,2,4,6,8]
linspace = np.linspace(0, 1, 5) # 5 points from 0 to 1

New cards

How do you perform boolean indexing?

arr = np.array([1, 2, 3, 4, 5, 6])

# Boolean conditions
print(arr > 3)      # [False, False, False, True, True, True]
print(arr[arr > 3]) # [4,5,6]

# Multiple conditions
print(arr[(arr > 2) & (arr < 6)])   # [3,4,5]

New cards

How do you concatenate arrays?

arr1 = np.array([1,2,3])
arr2 = np.array([4,5,6])

# Concatenate 1D arrays
result = np.concatenate([arr1, arr2])   # [1,2,3,4,5,6]

# Stack arrays
vstack = np.vstack([arr1, arr2])   # Vertical stack
hstack = np.hstack([arr1, arr2])   # Horizontal stack

New cards

How do you perform matrix multiplication?

A = np.array([[1,2], [3,4]])
B = np.array([[5,6], [7,8]])

# Matrix multiplication
result1 = np.dot(A, B)
result2 = A @ B       # Python 3.5+
result3 = np.matmul(A,B)

# Element-wise multiplication
result4 = A * B

New cards

How do you find unique values in arrays?

arr = np.array([1,2,2,3,3,3,4])

# Unique values
unique_vals = np.unique(arr)   # [1,2,3,4]

#Unique values with counts
unique_vals, counts = np.unique(arr, return_counts=True)
print(unique_vals)             # [1,2,3,4]
print(counts)                  # [1,2,3,1]

New cards

How do you save and load NumPy arrays?

arr = np.array([1,2,3,4,5])

# Save a single array
np.save('my_array.npy', arr)
loaded_arr = np.load('my_array.npy')

# Save multiple arrays
np.savez('arrays.npz', a=arr, b=arr*2)
data = np.load('arrays.npz')
print(data['a'], data['b'])

New cards

How do you perform broadcasting?

# Broadcasting allows operations on arrays of different shapes
arr = np.array([[1,2,3], [4,5,6]])

# Add scalar (broadcasts to all elements)
result1 = arr + 10

# Add 1D array to 2D array
vec = np.array([1,2,3])
result2 = arr + vec     # Adds vec to each row

New cards

How do you work with random numbers?

# Set seed for reproducibility
np.random.seed(42)

# Random arrays
rand_uniform = np.random.rand(3,4)   # Uniform [0,1)
rand_normal = np.random.randn(3,4)   # Standard normal
rand_int = np.random.randint(0,10,5) # Random integers

New cards

How do you perform array comparison?

arr1 = np.array([1,2,3])
arr2 = np.array([1,2,4])

# Element-wise comparison
print(arr1 == arr2)    # [True, True, False]

# Array equality
print(np.array_equal(arr1, arr2))  # False

# All/any conditions
print(np.all(arr1 > 0))  # True
print(np.any(arr1 > 2))  # True

New cards

How do you work with NaN values in NumPy?

arr = np.array([1,2,np.nan,4,np.nan])

# Check for NaN
print(np.isnan(arr))   # [False, False, True, False, True]

# NaN-aware functions
print(np.nanmean(arr))  # Mean ignoring NaN
print(np.nansum(arr))   # Sum ignoring NaN

# Remove NaN values
clean_arr = arr[~np.isnan(arr)]

New cards

How do you create a pandas DataFrame?

import pandas as pd

# From dictionary
df = pd.DataFrame({
  'name': ['Alice', 'Bob', 'Charlie'], 
  'age': [25, 30, 35], 
  'city': ['NYC', 'LA', 'Chicago']
})

# From lists
df2 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])

New cards

How do you read a csv file?

# Basic CSV reading
df = pd.read_csv('file.csv')

# With options
df = pd.read_csv('file.csv',
  index_col=0,       # Use first column as index
  sep=';',           # Different separator
  encoding='utf-8',  # Specify encoding
  na_values=['N/A']) # Custom missing values

New cards

How do you select data from DataFrames?

# Column selection
df['name']           # Single column (Series)
df[['name', 'age']]  # Multiple columns (DataFrame)

# Row selection 
df.loc[0]            # By label
df.iloc[0]           # By position
df.loc[0:2]          # Multiple rows

# Boolean indexing
df[df['age'] > 25]   # Filter rows

New cards

How do you handle missing data?

# Check for missing data
print(df.isnull().sum())     # Count missing values per column
print(df.info())             # Data types and non-null counts

# Handle missing data
df_clean = df.dropna()          # Drop rows with NaN
df_filled = df.fillna(0)        # Fill NaN with 0
df['age'] = df['age'].fillna(df['age'].mean())  # Fill with mean

New cards

How do you perform GroupBy operations?

# Basic groupby
grouped = df.groupby('city')
print(grouped['age'].mean())   # Mean age by city

# Multiple aggregations
result = df.groupby('city')['age'].agg(['mean','std','count'])

# Group by multiple columns
result2 = df.groupby(['city','gender'])['salary'].sum()

New cards

How do you filter DataFrames?

# Single condition
young = df[df['age'] < 30]

# Multiple conditions
filtered = df[(df['age'] > 25) & (df['city'] == 'NYC')]

# Using query method
result = df.query('age > 25 and city == "NYC"')

# String contains
name_filter = df[df['name'].str.contains('A')]

New cards

How do you add a new column?

# Simple assignment
df['age_squared'] = df['age'] ** 2

# Based on conditions
df['age_group'] = df['age'].apply(lambda x: 'young' if x < 30 else 'old')

# Multiple columns at once
df[['total', 'average']] = df[['col1', 'col2']].apply(lambda x: [x.sum(), x.mean()], axis=1, result_type='expand')

New cards

How do you rename columns?

# Rename specific columns
df_renamed = df.rename(columns={'old_name': 'new_name'})

# Rename all columns
df.columns = ['col1', 'col2', 'col3']

# Using str methods
df.columns = df.columns.str.lower().str.replace(' ','_')

New cards

How do you drop columns or rows?

# Drop columns
df_no_age = df.drop('age', axis=1)
df_subset = df.drop(['age', 'city'], axis=1)

# Drop rows
df_no_first = df.drop(0, axis=0)  # Drop first row
df_clean = df.drop([0, 1, 2])     # Drop multiple rows

# Alternative: del statement
del df['column_name']

New cards

How do you sort DataFrames?

# Sort by single column
df_sorted = df.sort_values('age')

# Sort by multiple columns
df_sorted = df.sort_values(['city', 'age'], ascending=[True, False])

# Sort by index
df_sorted = df.sort_index()

# In-place sorting
df.sort_values('age', inplace=True)

New cards

How do you merge DataFrames?

df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})

# Inner join (default)
inner = pd.merge(df1, df2, on='key')

# Left join
left = pd.merge(df1, df2, on='key', how='left')

# Different column names
merged = df1.merge(df2, left_on='id', right_on='user_id')

New cards

How do you concatenate DataFrames?

df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})

# Vertical concatenation (stack rows)
vertical = pd.concat([df1, df2])

# Horizontal concatenation (side by side)
horizontal = pd.concat([df1, df2], axis=1)

#  With keys
with_keys = pd.concat([df1, df2], keys=['first', 'second'])

New cards

How do you pivot DataFrames?

# Sample data
data = {'date': ['2023-01-01', '2023-01-01', '2023-01-02'],
  'variable': ['A', 'B', 'A'], 
  'value': [1, 2, 3]}
df = pd.DataFrame(data)

# Pivot table
pivoted = df.pivot(index='date', columns='variable', values='value')

# Pivot with aggregation
pivot_table = df.pivot_table(index='date', columns='variable', values='value', aggfunc='mean')

New cards

How do you work with datetime data?

# Convert to datetime
df['date'] = pd.to_datetime(df['date'])

# Extract date components
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day_name'] = df['date'].dt.day_name()

# Date arithmetic
df['days_ago'] = (pd.Timestamp.now() - df['date']).dt.days

New cards

How do you apply functions to DataFrames?

# Apply to entire DataFrame
df_squared = df.apply(lambda x: x**2) # Numeric columns only

# Apply to specific column
df['name_length'] = df['name'].apply(len)
df['name_upper'] = df['name'].apply(str.upper)

# Apply custom function
def categorize_age(age):
  """Return 'young' for ages below 30, and 'old' for 30 and above."""
  # A conditional expression must stay on one logical line; splitting it
  # before `else` without parentheses is a SyntaxError.
  return 'young' if age < 30 else 'old'

df['category'] = df['age'].apply(categorize_age)

New cards

How do you get DataFrame info and statistics?

# Basic info
print(df.info())       # Data types, memory usage
print(df.describe())   # Statistical summary
print(df.shape)        # (rows, columns)
print(df.dtypes)       # Column data types
print(df.columns)      # Column names
print(df.head())       # First 5 rows
print(df.tail())       # Last 5 rows

New cards

How do you handle duplicates?

# Check for duplicates
print(df.duplicated().sum())   # Count duplicates
print(df.duplicated(['name'])) # Duplicates in specific column

# Remove duplicates
df_unique = df.drop_duplicates()
df_unique = df.drop_duplicates(['name'], keep='first')

# Mark duplicates
df['is_duplicate'] = df.duplicated()

New cards

How do you reset index?

# Reset index (old index becomes column)
df_reset = df.reset_index()

# Reset and drop old index
df_reset = df.reset_index(drop=True)

# In-place reset
df.reset_index(drop=True, inplace=True)

New cards

How do you set index?

# Set single column as index
df_indexed = df.set_index('name')

# Set multiple columns as index (MultiIndex)
df_multi = df.set_index(['city', 'name'])

# In-place index setting
df.set_index('name', inplace=True)

New cards

How do you save DataFrames to CSV?

# Basic CSV export
df.to_csv('output.csv', index=False)

# With options
df.to_csv('output.csv',
          index=False,           # Don't include index
          sep=';',              # Different separator
          encoding='utf-8',     # Specify encoding
          na_rep='Missing')     # How to represent NaN

New cards

How do you work with string columns?

# String operations
df['name_upper'] = df['name'].str.upper()
df['name_lower'] = df['name'].str.lower()
df['name_length'] = df['name'].str.len()

# String contains
filtered = df[df['name'].str.contains('A', na=False)]

# String split
df[['first', 'last']] = df['full_name'].str.split(' ', expand=True)

New cards

How do you create cross-tabulations?

# Simple crosstab
crosstab = pd.crosstab(df['city'], df['age_group'])

# With percentages
crosstab_pct = pd.crosstab(df['city'], df['age_group'], normalize='index')

# With margins (totals)
crosstab_margins = pd.crosstab(df['city'], df['age_group'], margins=True)

New cards

How do you sample data?

# Random sample of n rows
sample_n = df.sample(n=5)

# Random sample of fraction
sample_frac = df.sample(frac=0.1)  # 10% of data

# Sample with replacement
sample_replace = df.sample(n=100, replace=True)

# Set seed for reproducibility
sample_seed = df.sample(n=5, random_state=42)

New cards

How do you replace values?

# Replace specific values
df_replaced = df.replace('old_value', 'new_value')

# Replace multiple values
df_replaced = df.replace({'A': 1, 'B': 2, 'C': 3})

# Replace in specific column
df['grade'] = df['grade'].replace({'A': 'Excellent', 'B': 'Good'})

# Replace using regex
df['text'] = df['text'].str.replace(r'\d+', 'NUMBER', regex=True)

New cards

How do you work with categorical data?

# Convert to categorical
df['category'] = df['category'].astype('category')

# Create dummy variables
dummies = pd.get_dummies(df['category'])

# Label encoding (manual)
df['category_code'] = df['category'].cat.codes

# One-hot encoding with prefix
dummies_prefix = pd.get_dummies(df['category'], prefix='cat')

New cards

How do you create a basic line plot?

import matplotlib.pyplot as plt

# Basic line plot
x = [1, 2, 3, 4, 5]
y = [2, 4, 6, 8, 10]

plt.plot(x, y)
plt.xlabel('X values')
plt.ylabel('Y values')
plt.title('My Line Plot')
plt.grid(True)
plt.show()

New cards

How do you create a scatter plot?

import matplotlib.pyplot as plt
import numpy as np

# Basic scatter plot
x = np.random.randn(100)
y = np.random.randn(100)
colors = np.random.rand(100)
sizes = 1000 * np.random.rand(100)

plt.scatter(x, y, c=colors, s=sizes, alpha=0.6)
plt.colorbar()
plt.show()

New cards

How do you create a bar chart?

import matplotlib.pyplot as plt

# Vertical bar chart
categories = ['A', 'B', 'C', 'D']
values = [23, 45, 56, 78]

plt.bar(categories, values, color='skyblue')
plt.ylabel('Values')
plt.title('Bar Chart')

# Horizontal bar chart
plt.barh(categories, values)
plt.show()

New cards

How do you create a histogram?

import matplotlib.pyplot as plt
import numpy as np

# Generate sample data
data = np.random.normal(100, 15, 1000)

# Create histogram
plt.hist(data, bins=30, alpha=0.7, color='green', edgecolor='black')
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.title('Histogram')
plt.show()

New cards

How do you customize plot appearance?

import matplotlib.pyplot as plt

# Set figure size and style
plt.figure(figsize=(10, 6))
plt.style.use('seaborn-v0_8')

# Plot with customization
plt.plot(x, y, color='red', linewidth=2, linestyle='--', marker='o')
plt.title('Customized Plot', fontsize=16, fontweight='bold')
plt.xlabel('X Label', fontsize=12)
plt.ylabel('Y Label', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

New cards

How do you create subplots?

import matplotlib.pyplot as plt

# Create subplots
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Plot in different subplots
axes[0, 0].plot(x, y)
axes[0, 0].set_title('Line Plot')

axes[0, 1].scatter(x, y)
axes[0, 1].set_title('Scatter Plot')

axes[1, 0].bar(categories, values)
axes[1, 0].set_title('Bar Chart')

axes[1, 1].hist(data)
axes[1, 1].set_title('Histogram')

plt.tight_layout()
plt.show()

New cards

How do you create a heatmap with seaborn?

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Create sample correlation matrix
data = np.random.rand(10, 10)
corr_matrix = np.corrcoef(data)

# Create heatmap
sns.heatmap(corr_matrix, 
            annot=True,          # Show values
            cmap='coolwarm',     # Color scheme
            center=0,            # Center colormap at 0
            square=True)         # Square cells
plt.title('Correlation Heatmap')
plt.show()

New cards

How do you create a box plot?

import seaborn as sns
import matplotlib.pyplot as plt

# Seaborn box plot
sns.boxplot(data=df, x='category', y='value')
plt.title('Box Plot by Category')
plt.xticks(rotation=45)
plt.show()

# Matplotlib box plot
data_groups = [group['value'].values for name, group in df.groupby('category')]
plt.boxplot(data_groups, labels=df['category'].unique())
plt.show()

New cards

How do you create a violin plot?

import seaborn as sns
import matplotlib.pyplot as plt

# Basic violin plot
sns.violinplot(data=df, x='category', y='value')
plt.title('Violin Plot')
plt.show()

# Split violin plot (compare two groups)
sns.violinplot(data=df, x='category', y='value', hue='group', split=True)
plt.show()

New cards

How do you save plots?

import matplotlib.pyplot as plt

# Create plot
plt.plot(x, y)
plt.title('My Plot')

# Save with different formats and options
plt.savefig('plot.png', dpi=300, bbox_inches='tight')
plt.savefig('plot.pdf', bbox_inches='tight')
plt.savefig('plot.jpg', dpi=150, facecolor='white')

# Save without displaying
plt.savefig('plot.png')
plt.close()  # Close figure to free memory

New cards

How do you create a correlation matrix plot?

import seaborn as sns
import matplotlib.pyplot as plt

# Calculate correlation matrix
corr = df.select_dtypes(include='number').corr()

# Plot correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(corr, 
            annot=True, 
            cmap='RdBu_r', 
            center=0,
            square=True,
            linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

New cards

How do you create a pair plot?

import seaborn as sns

# Pair plot of all numeric columns
sns.pairplot(df)
plt.show()

# Pair plot with grouping
sns.pairplot(df, hue='species', markers=['o', 's', 'D'])
plt.show()

# Pair plot with specific columns
sns.pairplot(df, vars=['col1', 'col2', 'col3'], hue='target')
plt.show()

New cards

How do you add legends to plots?

import matplotlib.pyplot as plt

# Plot multiple lines
plt.plot(x, y1, label='Line 1', color='blue')
plt.plot(x, y2, label='Line 2', color='red')
plt.plot(x, y3, label='Line 3', color='green')

# Add legend
plt.legend(loc='upper right')  # or 'best', 'lower left', etc.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # Outside plot
plt.show()

New cards

How do you create a pie chart?

import matplotlib.pyplot as plt

# Data for pie chart
labels = ['Python', 'Java', 'JavaScript', 'C++']
sizes = [30, 25, 20, 15]
explode = (0.1, 0, 0, 0)  # Explode first slice

# Create pie chart
plt.pie(sizes, labels=labels, explode=explode, autopct='%1.1f%%', 
        shadow=True, startangle=90)
plt.axis('equal')  # Equal aspect ratio
plt.title('Programming Languages Usage')
plt.show()

New cards

How do you plot time series data?

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Create time series data
dates = pd.date_range('2023-01-01', periods=100, freq='D')
values = np.cumsum(np.random.randn(100))

# Plot time series
plt.figure(figsize=(12, 6))
plt.plot(dates, values)
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Time Series Plot')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

New cards

How do you split data for machine learning?

from sklearn.model_selection import train_test_split

# Basic train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Stratified split (maintains class distribution)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

New cards

How do you create a linear regression model?

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Create and train model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'MSE: {mse}, R²: {r2}')

New cards

How do you evaluate model performance?

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Classification metrics
accuracy = accuracy_score(y_true, y_pred)
report = classification_report(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)

# Regression metrics
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mse)

New cards

How do you scale features?

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Standard scaling (mean=0, std=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Min-Max scaling (0 to 1)
min_max_scaler = MinMaxScaler()
X_minmax = min_max_scaler.fit_transform(X_train)

# Robust scaling (median-based)
robust_scaler = RobustScaler()
X_robust = robust_scaler.fit_transform(X_train)

New cards

How do you encode categorical variables?

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd

# Label encoding (ordinal)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# One-hot encoding
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X_categorical)

# Using pandas
dummies = pd.get_dummies(df['category'], prefix='cat')
df_encoded = pd.concat([df, dummies], axis=1)

New cards

How do you perform cross-validation?

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression

# Basic cross-validation
model = LogisticRegression()
scores = cross_val_score(model, X, y, cv=5)
print(f'CV Scores: {scores}')
print(f'Mean CV Score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})')

# Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=skf)

New cards

How do you create a classification model?

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Logistic Regression
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Support Vector Machine
svm = SVC(random_state=42)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

New cards

How do you handle imbalanced datasets?

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

# Resampling minority class
minority_upsampled = resample(minority_class, 
                             replace=True, 
                             n_samples=len(majority_class))

# SMOTE (Synthetic Minority Oversampling)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Class weights
clf = RandomForestClassifier(class_weight='balanced')
clf.fit(X_train, y_train)

New cards

How do you perform feature selection?

from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier

# Univariate feature selection
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)

# Recursive Feature Elimination
rf = RandomForestClassifier()
rfe = RFE(estimator=rf, n_features_to_select=10)
X_rfe = rfe.fit_transform(X, y)

# Feature importance from tree-based models
rf.fit(X, y)
importances = rf.feature_importances_

New cards

How do you create a pipeline?

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Create pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit and predict
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# Access individual steps
scaler = pipe.named_steps['scaler']
classifier = pipe.named_steps['classifier']

New cards

How do you tune hyperparameters?

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Grid Search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f'Best params: {grid_search.best_params_}')

New cards

How do you create a confusion matrix?

from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Create confusion matrix
cm = confusion_matrix(y_true, y_pred)
print(cm)

# Visualize confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')
plt.show()

# Classification report
print(classification_report(y_true, y_pred))

New cards

How do you perform clustering?

from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
import matplotlib.pyplot as plt

# K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_labels = kmeans.fit_predict(X)
centers = kmeans.cluster_centers_

# DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=5)
db_labels = dbscan.fit_predict(X)

# Hierarchical clustering
agg_clustering = AgglomerativeClustering(n_clusters=3)
agg_labels = agg_clustering.fit_predict(X)

# Visualize clusters
plt.scatter(X[:, 0], X[:, 1], c=cluster_labels)
plt.scatter(centers[:, 0], centers[:, 1], marker='x', s=200, c='red')
plt.show()

New cards

How do you reduce dimensionality?

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Principal Component Analysis
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
print(f'Explained variance ratio: {pca.explained_variance_ratio_}')

# t-SNE
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

# Visualize reduced dimensions
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)
plt.title('PCA')
plt.subplot(1, 2, 2)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)
plt.title('t-SNE')
plt.show()

New cards

How do you handle overfitting?

# Techniques to handle overfitting:
# (model, X and y are not defined in this card; they stand for an existing
# estimator and its training data.)

# 1. Cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5)  # cv=5 requests 5 folds

# 2. Regularization
# The estimators below are only constructed here; they are not fitted in this card.
from sklearn.linear_model import Ridge, Lasso
ridge = Ridge(alpha=1.0)  # L2 regularization
lasso = Lasso(alpha=1.0)  # L1 regularization

# 3. Early stopping (for neural networks)
# 4. Dropout (for neural networks)
# 5. Ensemble methods
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)  # constructed only, not fitted here

# 6. More training data
# 7. Feature selection
# 8. Reduce model complexity

New cards

How do you create lambda functions?

# Lambda functions (anonymous functions)
# A lambda is a single-expression function; binding it to a name, as done
# here, is for demonstration only.
square = lambda x: x**2
print(square(5))  # 25

# With multiple arguments
add = lambda x, y: x + y
print(add(3, 4))  # 7

# Using with built-in functions
numbers = [1, 2, 3, 4, 5]
squared = list(map(lambda x: x**2, numbers))  # [1, 4, 9, 16, 25]
evens = list(filter(lambda x: x % 2 == 0, numbers))  # [2, 4]

# In pandas
# df is not defined in this card; presumably an existing DataFrame that has
# an 'old_col' column. Each value of that column is doubled into 'new_col'.
df['new_col'] = df['old_col'].apply(lambda x: x * 2)

New cards

How do you handle exceptions?

# Basic try-except
# 10 / 0 raises ZeroDivisionError, so the except branch runs and result is
# never assigned.
try:
    result = 10 / 0
except ZeroDivisionError:
    print('Cannot divide by zero!')

# Multiple exceptions
# Each except clause handles one failure mode of the try body: int() rejecting
# the typed text (ValueError) or the user entering 0 (ZeroDivisionError).
try:
    value = int(input('Enter number: '))
    result = 10 / value
except ValueError:
    print('Invalid input!')
except ZeroDivisionError:
    print('Cannot divide by zero!')
finally:
    # Runs whether or not an exception was raised above.
    print('This always executes')

# Generic exception handler
# risky_operation is a placeholder; it is not defined in this card.
try:
    risky_operation()
except Exception as e:
    print(f'Error occurred: {e}')

New cards

How do you read and write files?

# Reading files
# Every read method continues from the current file position. read() leaves
# the position at end-of-file, so rewind with seek(0) before reading again;
# otherwise readlines() would return [] and readline() would return ''.
with open('file.txt', 'r') as f:
    content = f.read()          # Read entire file
    f.seek(0)                   # Rewind to the start
    lines = f.readlines()       # Read all lines
    f.seek(0)                   # Rewind to the start
    first_line = f.readline()   # Read one line

# Writing files
# Neither write() nor writelines() adds a line ending: 'Hello, World!' has no
# trailing '\n', so the file ends up as 'Hello, World!line1\nline2\n'.
with open('output.txt', 'w') as f:
    f.write('Hello, World!')
    f.writelines(['line1\n', 'line2\n'])

# Appending to files
# Mode 'a' adds to the end of log.txt instead of replacing its contents.
with open('log.txt', 'a') as f:
    f.write('New log entry\n')

# Working with CSV
import csv
# Open with newline='' so the csv module does its own newline handling;
# without it, newlines embedded in quoted fields may be misinterpreted.
with open('data.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)  # each row is a list of strings

New cards

How do you use map() and filter()?

# map() calls a function on every item and yields the results
numbers = [1, 2, 3, 4, 5]
squared = list(map(lambda n: n ** 2, numbers))  # [1, 4, 9, 16, 25]
strings = list(map(str, numbers))  # ['1', '2', '3', '4', '5']

# filter() keeps only the items for which the predicate is truthy
evens = list(filter(lambda n: not n % 2, numbers))  # [2, 4]
positive = list(filter(lambda n: 0 < n, [-1, 0, 1, 2]))  # [1, 2]

# map() over several iterables passes one item from each per call
list1 = [1, 2, 3]
list2 = [4, 5, 6]
sums = list(map(lambda left, right: left + right, list1, list2))  # [5, 7, 9]

New cards

How do you create decorators?

# Simple decorator
def my_decorator(func):
    """Wrap ``func`` so a message is printed before and after each call.

    Args:
        func: The callable to decorate.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func`` and returns whatever ``func`` returns.
    """
    # Local import keeps this snippet self-contained.
    from functools import wraps

    @wraps(func)  # preserve func's __name__, __doc__, etc. on the wrapper
    def wrapper(*args, **kwargs):
        print('Before function call')
        result = func(*args, **kwargs)
        print('After function call')
        return result
    return wrapper

# Using decorator
@my_decorator
def greet(name):
    """Return a greeting for ``name``."""
    return f'Hello, {name}!'

# Decorator with arguments
def repeat(times):
    """Build a decorator that calls the decorated function ``times`` times.

    Args:
        times: How many times to invoke the function on each call.

    Returns:
        A decorator. The decorated function returns the result of the last
        invocation, or ``None`` when ``times`` is zero or negative (the
        function is then never invoked).
    """
    # Local import keeps this snippet self-contained.
    from functools import wraps

    def decorator(func):
        @wraps(func)  # preserve func's __name__, __doc__, etc. on the wrapper
        def wrapper(*args, **kwargs):
            # Pre-bind result so times <= 0 returns None instead of raising
            # UnboundLocalError.
            result = None
            for _ in range(times):
                result = func(*args, **kwargs)
            return result
        return wrapper
    return decorator

@repeat(3)
def say_hello():
    """Print 'Hello!' (three times per call, because of @repeat(3))."""
    print('Hello!')

New cards

How do you work with generators?

# Generator function
def fibonacci():
    """Yield the Fibonacci numbers 0, 1, 1, 2, 3, ... without ever stopping."""
    current = 0
    following = 1
    while True:
        yield current
        # Slide the pair forward: the next value becomes current and the
        # sum of the old pair becomes the new next value.
        current, following = following, current + following

# Using generator
# fibonacci() never stops on its own, so pull a fixed number of values with
# next(). Prints 0, 1, 1, 2, 3, 5, 8, 13, 21, 34 (one per line).
fib = fibonacci()
for _ in range(10):
    print(next(fib))

# Generator expression
# Parentheses instead of brackets build a generator, not a list; list()
# consumes it, so squares is exhausted afterwards.
squares = (x**2 for x in range(10))
print(list(squares))  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

# Generator for memory efficiency
def read_large_file(file_path):
    """Lazily yield each line of ``file_path`` with surrounding whitespace removed.

    The file is opened when iteration starts and lines are produced one at
    a time instead of being collected into a list.
    """
    with open(file_path, 'r') as handle:
        yield from (raw_line.strip() for raw_line in handle)

New cards

How do you use *args and **kwargs?

# *args collects any number of positional arguments into a tuple
def sum_all(*args):
    """Return the sum of all positional arguments (0 when none are given)."""
    total = sum(args)
    return total

print(sum_all(1, 2, 3, 4))  # 10

# **kwargs collects any number of keyword arguments into a dict
def print_info(**kwargs):
    """Print every keyword argument on its own line as ``key: value``."""
    for key in kwargs:
        print(f'{key}: {kwargs[key]}')

print_info(name='Alice', age=30, city='NYC')

# Combining both
def flexible_function(*args, **kwargs):
    """Print the received positional tuple and keyword dict, one per line."""
    print(f'Args: {args}', f'Kwargs: {kwargs}', sep='\n')

flexible_function(1, 2, 3, name='Alice', age=30)

# Unpacking arguments
# A leading * spreads a sequence into separate positional arguments, so this
# call is the same as sum_all(1, 2, 3, 4) and prints 10.
numbers = [1, 2, 3, 4]
print(sum_all(*numbers))  # Unpacking list

# A leading ** spreads a dict into keyword arguments, so this call is the
# same as print_info(name='Bob', age=25).
info = {'name': 'Bob', 'age': 25}
print_info(**info)  # Unpacking dictionary

New cards

How do you create classes?

# Basic class
class Person:
    """A person identified by a name and an age."""

    def __init__(self, name, age):
        """Store the given name and age on the instance."""
        self.name, self.age = name, age

    def greet(self):
        """Return a short self-introduction."""
        introduction = f'Hi, I am {self.name}'
        return introduction

    def have_birthday(self):
        """Increase the age by one and return a congratulation message."""
        self.age += 1
        message = f'Happy birthday! Now {self.age}'
        return message

# Inheritance
class Student(Person):
    """A Person that additionally carries a student ID."""

    def __init__(self, name, age, student_id):
        """Initialise the inherited name/age fields, then record the student ID."""
        super().__init__(name, age)
        self.student_id = student_id

    def study(self, subject):
        """Return a sentence saying this student is studying ``subject``."""
        sentence = f'{self.name} is studying {subject}'
        return sentence

# Using classes
person = Person('Alice', 25)
student = Student('Bob', 20, 'S12345')  # name and age are forwarded to Person.__init__
print(person.greet())  # Hi, I am Alice
print(student.study('Python'))  # Bob is studying Python

New cards

How do you work with JSON data?

import json

# Python dict to JSON string
data = {'name': 'Alice', 'age': 30, 'city': 'NYC'}
json_string = json.dumps(data, indent=2)  # indent=2 pretty-prints the output
print(json_string)

# JSON string to Python dict
json_data = '{"name": "Bob", "age": 25}'
python_dict = json.loads(json_data)
print(python_dict)

# Save to JSON file
# dump() writes straight to the open file object (dumps() returns a string).
with open('data.json', 'w') as f:
    json.dump(data, f, indent=2)

# Load from JSON file
# Reads back the file written just above.
with open('data.json', 'r') as f:
    loaded_data = json.load(f)
    print(loaded_data)

# Handle datetime objects
from datetime import datetime


def serialize_datetime(obj):
    """Convert a datetime to an ISO 8601 string for use as a json ``default=`` hook.

    Args:
        obj: A value the json encoder could not serialize on its own.

    Returns:
        ``obj.isoformat()`` when ``obj`` is a ``datetime``.

    Raises:
        TypeError: For any other type, so unsupported values are reported
            instead of being silently stringified.
    """
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')


data_with_date = {'name': 'Alice', 'timestamp': datetime.now()}
# json.dumps(data_with_date) alone raises TypeError because datetime is not a
# JSON type; the custom serializer is passed through default=.
json_with_date = json.dumps(data_with_date, default=serialize_datetime)

100

New cards

How do you measure execution time?

import time
from timeit import timeit

# Using time module
# Take a timestamp before and after the work; the difference is the elapsed
# time in seconds.
# NOTE(review): time.time() reads the wall clock; time.perf_counter() is the
# usual choice for measuring intervals — consider switching.
start_time = time.time()
# Your code here
time.sleep(1)  # Simulate work
end_time = time.time()
execution_time = end_time - start_time
print(f'Execution time: {execution_time:.2f} seconds')

# Using timeit for small code snippets
# execution_time is reused here and now holds the total for all 10000 runs,
# which is why it is divided by 10000 to report the per-run average.
execution_time = timeit('sum(range(100))', number=10000)
print(f'Average time: {execution_time/10000:.6f} seconds')

# Context manager for timing
from contextlib import contextmanager

@contextmanager
def timer():
    """Context manager that prints how long the enclosed block took.

    The elapsed time is printed when the ``with`` block exits, including
    when it exits by raising; the exception still propagates to the caller.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for
    # measuring intervals, unlike the adjustable wall clock of time.time().
    start = time.perf_counter()
    try:
        yield
    finally:
        # finally guarantees the report even if the timed block raises.
        end = time.perf_counter()
        print(f'Elapsed time: {end - start:.2f} seconds')

with timer():
    # Your code here
    sum(range(1000000))