By the end of this week you will be able to:
A list is an ordered, mutable (changeable) collection enclosed in square brackets. Lists are the most versatile data structure in Python and the foundation of most data processing pipelines before you load data into pandas.
# Creating lists
exam_scores = [72, 88, 65, 91, 78, 55, 83, 90, 67, 74]
student_names = ["Amara", "Emeka", "Ngozi", "Tunde", "Aisha"]
mixed_list = [1, "two", 3.0, True, None]  # lists can mix types

# Length
# Single braces so the f-string evaluates len(...) instead of printing it literally.
print(f"Number of students: {len(exam_scores)}")  # 10

# Indexing - Python counts from 0
print(exam_scores[0])   # 72 (first item)
print(exam_scores[-1])  # 74 (last item - negative indexing)
print(exam_scores[2])   # 65 (third item)

# Slicing: list[start:stop:step] - stop is EXCLUDED
print(exam_scores[0:5])   # [72, 88, 65, 91, 78] first 5
print(exam_scores[5:])    # [55, 83, 90, 67, 74] from index 5 to end
print(exam_scores[:3])    # [72, 88, 65] first 3
print(exam_scores[::2])   # [72, 65, 78, 83, 67] every other item
print(exam_scores[::-1])  # reversed list

# Key list methods
scores = [72, 88, 65, 91, 78]
scores.append(84)          # add one item to end → [72, 88, 65, 91, 78, 84]
scores.extend([77, 95])    # add multiple items → [..., 84, 77, 95]
scores.insert(0, 100)      # insert at index 0 → [100, 72, 88, ...]
scores.remove(65)          # remove first occurrence of 65
popped = scores.pop()      # remove and return last item
popped2 = scores.pop(0)    # remove and return item at index 0
scores.sort()              # sort in place (ascending)
scores.sort(reverse=True)  # sort descending
sorted_copy = sorted(scores)  # returns NEW sorted list, original unchanged

# Summary statistics with built-in functions.
# Single braces so each expression is evaluated inside the f-string.
print(f"Max score: {max(scores)}")
print(f"Min score: {min(scores)}")
print(f"Total: {sum(scores)}")
print(f"Mean: {sum(scores)/len(scores):.1f}")
print(f"Score 88 appears {scores.count(88)} times")
print(f"Index of max: {scores.index(max(scores))}")

A tuple is an ordered, immutable sequence enclosed in parentheses. Once created, its values cannot be changed. This makes tuples safer for data that should not change, such as geographic coordinates, RGB colour values, or database column definitions.
# Tuples for fixed data
ibadan_coords = (7.3775, 3.9470)  # latitude, longitude
rgb_teal = (0, 196, 167)          # CSS colour
student_record = ("P001", "Amara Okafor", "Statistics", 3.85)

# Unpacking - assign tuple elements to variables
lat, lon = ibadan_coords
print(f"Ibadan: {lat}°N, {lon}°E")
name, dept, gpa = ("Emeka", "Computer Science", 3.72)

# Tuples as dictionary keys (lists cannot be keys)
# A single pair of braces builds the dict; a doubled pair would try to put
# the dict inside a set and raise TypeError (dicts are unhashable).
coordinates = {
    (7.3775, 3.9470): "Ibadan",
    (6.5244, 3.3792): "Lagos",
    (9.0579, 7.4951): "Abuja",
}

# Why use tuple instead of list?
# 1. Signals to readers: this data should not change
# 2. Slightly faster access than lists
# 3. Can be used as dictionary keys
# 4. Functions can return multiple values as tuples
def get_stats(data):
    """Return the minimum, maximum and mean of ``data`` as a tuple.

    Args:
        data: A non-empty sequence of numbers. It must support ``len()``
            because the mean is computed as ``sum(data) / len(data)``.

    Returns:
        A ``(minimum, maximum, mean)`` tuple.

    Raises:
        ValueError: If ``data`` is empty (raised by ``min``).
    """
    return min(data), max(data), sum(data) / len(data)  # returns tuple


# Unpack the returned tuple straight into three variables.
low, high, avg = get_stats([72, 88, 65, 91])
print(f"Low: {low} High: {high} Avg: {avg:.1f}")

A dictionary stores key-value pairs enclosed in curly braces. Keys must be unique and hashable (in practice, immutable values such as strings, numbers, or tuples). Dictionaries are optimised for fast lookup by key (O(1) average time complexity) and are the primary data structure for structured records before loading into pandas DataFrames.
# Creating a dictionary
patient = {
    "id": "P001",
    "name": "Chinwe Eze",
    "age": 34,
    "diagnosis": "Type 2 Diabetes",
    "hba1c": [7.2, 6.9, 7.5, 6.8],  # values can be lists
    "enrolled": True,
}

# Accessing values
print(patient["name"])                             # Chinwe Eze
print(patient.get("blood_group", "Not recorded"))  # safe access - no KeyError

# Modifying
patient["treatment"] = "Metformin"  # add new key
patient["age"] = 35                 # update existing key

# Checking existence
print("diagnosis" in patient)  # True
print("weight" in patient)     # False

# Iterating
# .items() yields (key, value) pairs; ":15s" pads each key to 15 characters
# so the values line up in a column.
for key, value in patient.items():
    print(f" {key:15s}: {value}")
# Dictionary of dictionaries (nested)
hospital_db = {
    "P001": {"name": "Chinwe", "age": 34, "ward": "A"},
    "P002": {"name": "Emeka", "age": 52, "ward": "B"},
    "P003": {"name": "Ngozi", "age": 28, "ward": "A"},
}

# Access nested value
print(hospital_db["P002"]["name"])  # Emeka

# Count patients per ward
ward_counts = {}
for pid, data in hospital_db.items():
    ward = data["ward"]
    # .get(ward, 0) starts the tally at 0 the first time a ward is seen.
    ward_counts[ward] = ward_counts.get(ward, 0) + 1
print(ward_counts)  # {'A': 2, 'B': 1}

# Sets automatically eliminate duplicates
raw_regions = [
    "South-West", "North-Central", "South-West",
    "South-East", "North-West", "South-West", "South-East",
]
unique_regions = set(raw_regions)
print(f"Unique regions: {len(unique_regions)}")
print(unique_regions)  # order is not guaranteed

# Membership testing (faster than list for large datasets)
print("South-West" in unique_regions)  # True - O(1)
print("North-East" in unique_regions)  # False

# Set operations
# Set literals use a single pair of braces; a doubled pair would nest a set
# inside a set and raise TypeError (sets are unhashable).
study_A = {"P001", "P002", "P004", "P007", "P010"}
study_B = {"P002", "P005", "P007", "P011"}
both = study_A & study_B    # intersection: in A AND B
either = study_A | study_B  # union: in A OR B
only_A = study_A - study_B  # difference: in A but not B
print(f"In both studies: {both}")
print(f"In either study: {len(either)} participants")
print(f"Only in Study A: {only_A}")

# Use case: find common columns between two datasets
df1_cols = {"id", "age", "income", "region", "outcome"}
df2_cols = {"id", "age", "credit_score", "region", "default"}
shared = df1_cols & df2_cols
print(f"Shared columns: {shared}")

A list comprehension creates a new list by applying an expression to each item in an iterable, with an optional filter condition. Comprehensions are more concise than equivalent for loops and are considered more "Pythonic" — they are standard in production data science code.
# Syntax: [expression for item in iterable if condition]
numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Squares of all numbers
squares = [n**2 for n in numbers]

# Even numbers only
evens = [n for n in numbers if n % 2 == 0]

# Squares of even numbers
even_squares = [n**2 for n in numbers if n % 2 == 0]
print(even_squares)  # [4, 16, 36, 64, 100]

# Data science use: normalise scores to 0-1 range
# (min-max scaling; assumes the scores are not all identical, otherwise the
# denominator max_s - min_s would be zero)
scores = [55, 72, 88, 91, 64, 77]
min_s = min(scores)
max_s = max(scores)
normalised = [(s - min_s) / (max_s - min_s) for s in scores]
print([round(v, 3) for v in normalised])

# String processing: clean a list of region names
raw = [" Lagos ", "KANO", "abuja ", "Lagos", "kano"]
clean = [r.strip().title() for r in raw]
print(clean)  # ["Lagos", "Kano", "Abuja", "Lagos", "Kano"]

# Dictionary comprehension
# One pair of braces with "key: value" builds a dict.
words = ["data", "science", "machine", "learning"]
word_lengths = {word: len(word) for word in words}
print(word_lengths)  # {"data": 4, "science": 7, ...}

# Reading a text file
# Always use "with" - it automatically closes the file even if an error occurs
with open("data.txt", "r") as f:
    content = f.read()  # entire file as one string
with open("data.txt", "r") as f:
    lines = f.readlines()  # list of strings, one per line

# Writing to a file ("w" overwrites; "a" appends)
results = ["72 - Pass", "88 - Distinction", "51 - Pass", "48 - Fail"]
with open("results.txt", "w") as f:
    f.write("Exam Results\n")
    f.write("=" * 30 + "\n")
    # write() does not add a newline, so append one to every line.
    for line in results:
        f.write(line + "\n")
print("File written successfully")

import csv
# Writing a CSV file
students = [
    {"name": "Amara Okafor", "score": 88, "grade": "Distinction"},
    {"name": "Emeka Nwosu", "score": 74, "grade": "Merit"},
    {"name": "Ngozi Adeyemi", "score": 61, "grade": "Pass"},
    {"name": "Tunde Bakare", "score": 45, "grade": "Fail"},
]
# newline="" lets the csv module control line endings itself
# (avoids blank rows between records on Windows).
with open("students.csv", "w", newline="") as f:
    fieldnames = ["name", "score", "grade"]
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()        # write the column header row
    writer.writerows(students)  # write all data rows

# Reading a CSV file back
# newline="" is recommended for reading too, so newlines embedded in quoted
# fields are interpreted correctly.
with open("students.csv", "r", newline="") as f:
    reader = csv.DictReader(f)  # each row is a dictionary
    data = list(reader)

# Count grades
grade_counts = {}
for row in data:
    g = row["grade"]
    grade_counts[g] = grade_counts.get(g, 0) + 1
print(grade_counts)

# Average score
# csv returns every field as a string, so convert to float before summing.
avg = sum(float(row["score"]) for row in data) / len(data)
print(f"Average score: {avg:.1f}")

In Week 3 you will replace the csv module with pandas, which handles CSV files far more powerfully. Understanding the csv module first gives you insight into what pandas does under the hood.
Create a student_database dictionary where each key is a student ID (e.g. "STU001") and each value is a dictionary containing: name, age, scores (a list of 5 module marks), and course. Write three functions:
Test with at least 5 students. Push to GitHub.
Download any small CSV dataset from Kaggle (e.g. the Iris flower dataset or a simple sales dataset). Write a Python script that:
Your script must work correctly, have comments explaining each step, and be submitted to your GitHub repository.