# --- Import libraries ---
import pandas as pd              # pandas: load and manipulate tabular data (DataFrames)
import numpy as np               # numpy: numeric support (used indirectly by pandas/seaborn)
import matplotlib.pyplot as plt  # matplotlib: base engine for drawing charts
import seaborn as sns            # seaborn: high-level statistical charts on top of matplotlib

# --- Plot styling shared by every chart (Bellabeat-inspired look) ---
# Brand palette, in order: teal, coral, dark blue, gold, green.
BELLA = ["#2A9D8F", "#E76F51", "#264653", "#E9C46A", "#8AB17D"]
sns.set_theme(style="whitegrid")   # apply the light grid theme first...
sns.set_palette(BELLA)             # ...then make the brand colors seaborn's default cycle
# Figure defaults are set after the theme so they are the last word on these keys.
plt.rcParams.update({
    "figure.figsize": (10, 5),     # default figure size: width, height in inches
    "axes.titlesize": 13,          # font size of each chart (Axes) title
    "axes.titleweight": "bold",    # chart titles in bold
})

# --- Read the four Fitabase CSV exports into DataFrames ---
DATA = "Data/Fitabase Data 4.12.16-5.12.16/"     # data folder (path relative to the notebook)
# One read_csv call per file, unpacked in the same order as the file names below:
#   activity -> daily activity, sleep -> daily sleep,
#   weight   -> weight log,     hourly -> steps per hour
activity, sleep, weight, hourly = (
    pd.read_csv(DATA + csv_name)
    for csv_name in (
        "dailyActivity_merged.csv",
        "sleepDay_merged.csv",
        "weightLogInfo_merged.csv",
        "hourlySteps_merged.csv",
    )
)

# Size check: .shape is the (rows, columns) pair of each DataFrame
print(
    "activity:", activity.shape,
    "| sleep:", sleep.shape,
    "| weight:", weight.shape,
    "| hourly:", hourly.shape,
)

activity: (940, 15) | sleep: (413, 5) | weight: (67, 8) | hourly: (22099, 3)

# --- Distinct users per file ('Id' is the user identifier column) ---
tables = {"activity": activity, "sleep": sleep, "weight": weight, "hourly": hourly}
print("== UNIQUE USERS ==")
for name, df in tables.items():
    n_users = df['Id'].nunique()             # number of distinct Id values
    print(f"{name:9s}: {n_users} users")     # {name:9s} pads the table name to 9 characters

# --- Fully duplicated rows (the hourly table is not checked here) ---
print("\n== DUPLICATES ==")
for name in ("activity", "sleep", "weight"):
    df = tables[name]
    n_dupes = df.duplicated().sum()          # .duplicated() flags repeated rows; True counts as 1
    print(f"{name:9s}: {n_dupes} duplicate rows")

# --- Missing values in the activity file ---
print("\n== MISSING VALUES (activity) ==")
missing_per_col = activity.isna().sum()      # missing-value count for every column
if missing_per_col.sum():
    print(missing_per_col[missing_per_col > 0])   # list only the columns with gaps
else:
    print("None")
# Missing entries in the weight file's Fat column, next to the total number of rows
print("\nFat missing in weight:", weight['Fat'].isna().sum(), "out of", len(weight))

== UNIQUE USERS ==
activity : 33 users
sleep    : 24 users
weight   : 8 users
hourly   : 33 users

== DUPLICATES ==
activity : 0 duplicate rows
sleep    : 3 duplicate rows
weight   : 0 duplicate rows

== MISSING VALUES (activity) ==
None

Fat missing in weight: 65 out of 67

# 1) De-duplicate the sleep table (drop_duplicates returns a new DataFrame, rebound to 'sleep')
sleep = sleep.drop_duplicates()

# 2) Parse date strings into datetimes with explicit formats (no month/day guessing)
TS_FORMAT = '%m/%d/%Y %I:%M:%S %p'   # e.g. "4/12/2016 12:00:00 AM"
activity['ActivityDate'] = pd.to_datetime(activity['ActivityDate'], format='%m/%d/%Y')  # e.g. "4/12/2016"
sleep['SleepDay'] = pd.to_datetime(sleep['SleepDay'], format=TS_FORMAT)
hourly['ActivityHour'] = pd.to_datetime(hourly['ActivityHour'], format=TS_FORMAT)

# 3) Derived columns used by the analysis
# -- activity
activity['Weekday'] = activity['ActivityDate'].dt.day_name()           # Monday, Tuesday, ...
activity['ActiveMinutes'] = (
    activity['VeryActiveMinutes']
    .add(activity['FairlyActiveMinutes'])
    .add(activity['LightlyActiveMinutes'])
)                                                                       # active minutes, all intensities
activity['SedentaryHours'] = activity['SedentaryMinutes'].div(60)       # minutes -> hours
# -- hourly
hourly['Hour'] = hourly['ActivityHour'].dt.hour                         # hour of day, 0-23
# -- sleep
sleep['HoursAsleep'] = sleep['TotalMinutesAsleep'].div(60)              # minutes -> hours
sleep['TimeInBedAwake'] = sleep['TotalTimeInBed'].sub(sleep['TotalMinutesAsleep'])  # in bed but awake (minutes)

# 4) Flag likely non-wear days: 1440 sedentary minutes = the full 24 hours
activity['NonWear'] = activity['SedentaryMinutes'].eq(1440)             # boolean column
print("Days with device not worn (sedentary=1440):", activity['NonWear'].sum())  # True counts as 1
print("Days with 0 steps:", activity['TotalSteps'].eq(0).sum())

# Independent copy without the flagged days, so they do not skew the activity averages.
# NOTE(review): only the exact 1440-minute case is excluded; other zero-step days stay in.
activity_valid = activity.loc[~activity['NonWear']].copy()
print("Total activity rows:", len(activity), "| valid (device worn):", len(activity_valid))

Days with device not worn (sedentary=1440): 79
Days with 0 steps: 77
Total activity rows: 940 | valid (device worn): 861

# Summary table (count, mean, std, min, quartiles, max) for the key daily metrics, 1 decimal
stat_cols = ['TotalSteps', 'TotalDistance', 'ActiveMinutes', 'SedentaryMinutes', 'Calories']
summary = activity_valid[stat_cols].describe().round(1)
display(summary)   # notebook rich rendering of the DataFrame

# Headline averages: activity figures use worn days only, sleep figures use the de-duplicated sleep table
mean_steps     = activity_valid['TotalSteps'].mean()
mean_calories  = activity_valid['Calories'].mean()
mean_sed_hours = activity_valid['SedentaryHours'].mean()
mean_sleep_h   = sleep['HoursAsleep'].mean()
mean_awake_min = sleep['TimeInBedAwake'].mean()

print("Avg steps/day        :", round(mean_steps))
print("Avg calories/day     :", round(mean_calories))
print("Avg sedentary hours  :", round(mean_sed_hours, 1))
print("Avg hours of sleep   :", round(mean_sleep_h, 2))
print("Avg min awake in bed :", round(mean_awake_min, 1))

Avg steps/day        : 8280
Avg calories/day     : 2351
Avg sedentary hours  : 15.8
Avg hours of sleep   : 6.99
Avg min awake in bed : 39.3

# Join activity and sleep on user + calendar day so both sit on the same row.
# Inner join: only user-days present in both tables survive.
merged = pd.merge(
    activity_valid, sleep,
    how='inner',
    left_on=['Id', 'ActivityDate'],   # keys on the activity side
    right_on=['Id', 'SleepDay'],      # keys on the sleep side
)
print("activity+sleep rows:", len(merged), "| users:", merged['Id'].nunique())

# Pearson correlation coefficients (-1 to +1); the sleep pair is only available on the joined rows
r_steps_cal  = activity_valid['TotalSteps'].corr(activity_valid['Calories'])
r_sed_sleep  = merged['SedentaryMinutes'].corr(merged['TotalMinutesAsleep'])
r_active_cal = activity_valid['ActiveMinutes'].corr(activity_valid['Calories'])

print("\nCorrelations:")
print("  Steps     ↔ Calories :", round(r_steps_cal, 3))
print("  Sedentary ↔ Sleep    :", round(r_sed_sleep, 3))
print("  Active min↔ Calories :", round(r_active_cal, 3))

activity+sleep rows: 410 | users: 24

Correlations:
  Steps     ↔ Calories : 0.569
  Sedentary ↔ Sleep    : -0.601
  Active min↔ Calories : 0.446

# --- Usage frequency: number of distinct worn days per user ---
worn_days = activity.loc[~activity['NonWear']]                       # drop days flagged as non-wear
usage_days = worn_days.groupby('Id')['ActivityDate'].nunique()       # distinct dates per user Id
def segment(d):
    """Return the usage-band label for a user who wore the device on ``d`` days.

    Bands are checked from the highest threshold down: 21 or more days is
    "High use", 11 to 20 is "Moderate use", and anything below 11 is "Low use".
    """
    bands = (
        (21, "High use (21–31 d)"),       # device worn almost always
        (11, "Moderate use (11–20 d)"),   # intermediate use
    )
    for floor, label in bands:
        if d >= floor:
            return label
    return "Low use (1–10 d)"             # occasional use
# Classify every user and count users per band.
# value_counts() can carry the source column name ('ActivityDate') over as the index name,
# which would mislabel the table and any chart built from it, so the axis name is cleared.
seg_counts = usage_days.apply(segment).value_counts().rename_axis(None)
# Header and table are printed separately so the first row is not shifted by print's separator.
print("User segmentation by usage frequency:")
print(seg_counts.to_string())

# --- How many users actually track each metric? ---
print("\nUsers tracking activity:", activity['Id'].nunique(),
      "| sleep:", sleep['Id'].nunique(),
      "| weight:", weight['Id'].nunique())

# --- Hourly pattern: average steps for each hour of the day ---
hourly_avg = hourly.groupby('Hour')['StepTotal'].mean()   # group by hour and average the steps
# --- Weekly pattern: average steps per weekday (worn days only), reordered Mon->Sun ---
order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
weekday_avg = activity_valid.groupby('Weekday')['TotalSteps'].mean().reindex(order)  # .reindex() forces the weekday order
# The three hours with the highest average step count, as {hour: rounded average}
print("\nPeak hours (avg steps):", hourly_avg.sort_values(ascending=False).head(3).round(0).to_dict())

User segmentation by usage frequency:
 ActivityDate
High use (21–31 d)        25
Moderate use (11–20 d)     7
Low use (1–10 d)           1

Users tracking activity: 33 | sleep: 24 | weight: 8

Peak hours (avg steps): {18: 599.0, 19: 583.0, 17: 550.0}

# Chart 1: relationship between steps and calories, with regression line
fig, ax = plt.subplots()                         # create figure (fig) and drawing area (ax)
sns.regplot(data=activity_valid, x='TotalSteps', y='Calories',   # scatter + trend line
            scatter_kws={'alpha':0.35, 's':25, 'color':BELLA[0]},  # semi-transparent points (alpha), size s, color
            line_kws={'color':BELLA[1], 'lw':2}, ax=ax)            # coral line, width lw=2
# Compute r from the plotted data so the title cannot drift from the analysis
# (the title previously hard-coded 0.59 while the computed coefficient is 0.569).
r_steps_cal = activity_valid['TotalSteps'].corr(activity_valid['Calories'])
ax.set_title(f"More steps → more calories burned (r ≈ {r_steps_cal:.2f})")   # chart title
ax.set_xlabel("Daily steps"); ax.set_ylabel("Calories")            # X and Y axis labels
plt.tight_layout(); plt.show()                   # optimize margins and show the chart

# Chart 2: average steps by hour of day (bars), highlighting the 3 peak hours
fig, ax = plt.subplots()
hourly_avg.plot(kind='bar', color=BELLA[0], ax=ax)               # bar chart of average steps per hour
# Bars are drawn in index order, so each bar is addressed by its position in the index,
# not by the hour label; the two only coincide when every hour 0-23 is present.
peak_hours = set(hourly_avg.sort_values(ascending=False).head(3).index)   # the 3 hours with most steps
for pos, hour in enumerate(hourly_avg.index):
    if hour in peak_hours:
        ax.patches[pos].set_color(BELLA[1])                      # color the matching bar coral
ax.set_title("Average steps by hour of day — peaks at lunch (12-2pm) and evening (5-7pm)")
ax.set_xlabel("Hour"); ax.set_ylabel("Avg steps")
plt.tight_layout(); plt.show()

# Chart 3: average steps per weekday (Mon->Sun) with a dashed reference line
fig, ax = plt.subplots()
weekday_avg.plot(kind='bar', color=BELLA[4], ax=ax)              # green bars, one per weekday
mean_of_weekdays = weekday_avg.mean()                            # mean of the seven weekday averages
ax.axhline(mean_of_weekdays, color=BELLA[1], ls='--', label='Weekly average')
ax.set(title="Average steps by day of week", xlabel="", ylabel="Avg steps")
ax.legend()                                                      # legend entry for the reference line
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')          # tilt weekday labels by 45°
fig.tight_layout()
plt.show()

# Chart 4: how the average worn day splits across activity levels (pie)
level_cols = ['VeryActiveMinutes', 'FairlyActiveMinutes',
              'LightlyActiveMinutes', 'SedentaryMinutes']
parts = activity_valid[level_cols].mean()                        # mean minutes per activity level
labels = ['Very active', 'Fairly active', 'Lightly active', 'Sedentary']   # same order as level_cols
slice_colors = [BELLA[i] for i in (1, 3, 4, 2)]                  # coral, gold, green, dark blue

fig, ax = plt.subplots(figsize=(7, 7))                           # square canvas for the pie
ax.pie(
    parts,
    labels=labels,
    colors=slice_colors,
    autopct='%1.1f%%',                                           # percentage printed on each slice
    startangle=90,
    wedgeprops={'edgecolor': 'white'},                           # white border between slices
)
ax.set_title("How the average user spends the day\n(79% of tracked time is sedentary)")
fig.tight_layout()
plt.show()

# Chart 5: sedentary hours against hours asleep on the joined activity+sleep rows
point_style = dict(alpha=0.35, s=25, color=BELLA[2])             # semi-transparent dark-blue points
trend_style = dict(color=BELLA[1], lw=2)                         # coral regression line
fig, ax = plt.subplots()
sns.regplot(
    data=merged,
    x='SedentaryHours',
    y='HoursAsleep',
    scatter_kws=point_style,
    line_kws=trend_style,
    ax=ax,
)
ax.set(
    title="More sedentary hours → less sleep (r ≈ −0.60)",
    xlabel="Sedentary hours / day",
    ylabel="Hours asleep",
)
fig.tight_layout()
plt.show()

# Chart 6: number of users per usage-frequency band (horizontal bars)
fig, ax = plt.subplots(figsize=(8,5))
seg_counts.reindex(["High use (21–31 d)","Moderate use (11–20 d)","Low use (1–10 d)"]).plot(  # fixed band order
    kind='barh', color=[BELLA[0],BELLA[3],BELLA[1]], ax=ax)     # barh = horizontal bars, one color per band
ax.set_title("Device usage frequency — most users wear it almost every day")
ax.set_xlabel("Number of users")
# pandas labels the category axis with the index name; seg_counts can carry 'ActivityDate'
# over from the counted column, so clear it here (chart 3 blanks its category label the same way).
ax.set_ylabel("")
plt.tight_layout(); plt.show()

Criterion	Rating	Notes
Reliable	⚠️ Low	Only ~30 users, small non-random sample
Original	⚠️ Medium	Third-party data collected via Amazon Mechanical Turk
Comprehensive	⚠️ Medium	Missing demographics (sex, age): critical for a women's brand
Current	❌ Low	Data from 2016, over 8 years old
Cited	✅	Documented source (Kaggle/Mobius)

	TotalSteps	TotalDistance	ActiveMinutes	SedentaryMinutes	Calories
count	861.0	861.0	861.0	861.0	861.0
mean	8280.3	5.9	248.4	950.0	2350.6
std	4783.2	3.7	104.9	280.9	713.9
min	0.0	0.0	0.0	0.0	52.0
25%	4832.0	3.3	184.0	720.0	1852.0
50%	7990.0	5.6	258.0	1019.0	2207.0
75%	11085.0	7.9	323.0	1187.0	2828.0
max	36019.0	28.0	552.0	1439.0	4900.0

Bellabeat Case Study — How Can a Wellness Company Play It Smart?¶

Google Data Analytics Capstone · Junior Data Analyst, Bellabeat Marketing Analytics Team¶

1. Ask¶

Business context¶

Business task¶

Guiding questions¶

Stakeholders¶

Deliverables¶

2. Prepare¶

Data source¶

Credibility assessment — ROCCC¶

Main limitations (to keep in mind for the recommendations)¶

3. Process¶

3.1 Initial inspection¶

3.2 Cleaning and transformation¶

4. Analyze¶

4.1 Descriptive statistics¶

4.2 Relationships between variables¶

4.3 When and how much people use the device¶

5. Share — Visualizations¶

6. Act — Conclusions and recommendations¶

Chosen product: the Bellabeat app (with a focus on the Leaf/Time tracker)¶

Answers to the business questions¶

🎯 Top 3 recommendations¶

Next steps and limitations¶

Bellabeat Case Study — How Can a Wellness Company Play It Smart?¶

Google Data Analytics Capstone · Junior Data Analyst, Bellabeat Marketing Analytics Team¶

1. Ask¶

Business context¶

Business task¶

Guiding questions¶

Stakeholders¶

Deliverables¶

2. Prepare¶

Data source¶

Credibility assessment — ROCCC¶

Main limitations (to keep in mind for the recommendations)¶

3. Process¶

3.1 Initial inspection¶

3.2 Cleaning and transformation¶

4. Analyze¶

4.1 Descriptive statistics¶

4.2 Relationships between variables¶

4.3 When and how much people use the device¶

5. Share — Visualizations¶

6. Act — Conclusions and recommendations¶

Chosen product: the Bellabeat app (with a focus on the Leaf/Time tracker)¶

Answers to the business questions¶

🎯 Top 3 recommendations¶

Next steps and limitations¶