Feat Regional Analysis
#1
by SmileXing - opened
- .env.example +4 -0
- .gitignore +8 -0
- CHANGELOG.md +20 -0
- README.md +69 -12
- app.py +4 -3
- pyproject.toml +10 -0
- requirements.txt +1 -0
- src/leaderboard_analytics/__init__.py +0 -1
- src/leaderboard_analytics/config.py +6 -1
- src/leaderboard_analytics/db.py +3 -2
- src/leaderboard_analytics/geoip_database.py +36 -0
- src/leaderboard_analytics/main.py +32 -5
- src/leaderboard_analytics/repositories.py +294 -46
- src/leaderboard_analytics/schemas.py +11 -4
- src/leaderboard_analytics/services.py +220 -18
- src/leaderboard_analytics/ui.py +396 -63
- tests/test_geoip_database.py +29 -0
- tests/test_repositories.py +95 -0
- tests/test_schemas.py +16 -0
- tests/test_services.py +110 -0
.env.example
CHANGED
|
@@ -4,3 +4,7 @@ MONGO_COLLECTION=events
|
|
| 4 |
HOST=0.0.0.0
|
| 5 |
PORT=7860
|
| 6 |
GRADIO_SHARE=false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
HOST=0.0.0.0
|
| 5 |
PORT=7860
|
| 6 |
GRADIO_SHARE=false
|
| 7 |
+
GRADIO_SSR_MODE=false
|
| 8 |
+
GEOIP_DATABASE_PATH=GeoLite2-Country.mmdb
|
| 9 |
+
GEOIP_DATABASE_URL=https://cdn.jsdelivr.net/npm/geolite2-country/GeoLite2-Country.mmdb.gz
|
| 10 |
+
GEOIP_AUTO_DOWNLOAD=true
|
.gitignore
CHANGED
|
@@ -49,6 +49,7 @@ coverage.xml
|
|
| 49 |
*.py.cover
|
| 50 |
.hypothesis/
|
| 51 |
.pytest_cache/
|
|
|
|
| 52 |
cover/
|
| 53 |
|
| 54 |
# Translations
|
|
@@ -144,6 +145,13 @@ ENV/
|
|
| 144 |
env.bak/
|
| 145 |
venv.bak/
|
| 146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
# Spyder project settings
|
| 148 |
.spyderproject
|
| 149 |
.spyproject
|
|
|
|
| 49 |
*.py.cover
|
| 50 |
.hypothesis/
|
| 51 |
.pytest_cache/
|
| 52 |
+
.pytest_tmp/
|
| 53 |
cover/
|
| 54 |
|
| 55 |
# Translations
|
|
|
|
| 145 |
env.bak/
|
| 146 |
venv.bak/
|
| 147 |
|
| 148 |
+
# Local GeoIP databases
|
| 149 |
+
*.mmdb
|
| 150 |
+
*.mmdb.gz
|
| 151 |
+
|
| 152 |
+
# Local analytics exports
|
| 153 |
+
visitor_ips*.csv
|
| 154 |
+
|
| 155 |
# Spyder project settings
|
| 156 |
.spyderproject
|
| 157 |
.spyproject
|
CHANGELOG.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Changelog
|
| 2 |
+
|
| 3 |
+
All notable changes to this project will be documented in this file.
|
| 4 |
+
|
| 5 |
+
## Unreleased
|
| 6 |
+
|
| 7 |
+
### Added
|
| 8 |
+
|
| 9 |
+
- Added full-range overview totals so UV and Sessions are distinct counts across the selected range.
|
| 10 |
+
- Added ordered funnel logic that counts each step only when it occurs after the previous required step.
|
| 11 |
+
- Added benchmark choices, raw data tables, and CSV export support to the dashboard.
|
| 12 |
+
- Added query validation, MongoDB ping checks, and dashboard-friendly error messages.
|
| 13 |
+
- Added pytest coverage for metric totals, query validation, and MongoDB aggregation pipeline shape.
|
| 14 |
+
- Added CI for formatting, linting, and tests.
|
| 15 |
+
|
| 16 |
+
### Changed
|
| 17 |
+
|
| 18 |
+
- Updated new vs returning visitor logic to compute first-seen dates from the full available page-view history before applying the selected reporting range.
|
| 19 |
+
- Updated MongoDB aggregation pipelines to prefer an indexed `ts` Date field while retaining fallback support for legacy `timestamp` values.
|
| 20 |
+
- Documented recommended MongoDB indexes for production deployments.
|
README.md
CHANGED
|
@@ -23,8 +23,10 @@ The primary purpose of this document is to define **what is measured**, **where
|
|
| 23 |
All analytics are based on the `events` collection and the following stable fields:
|
| 24 |
|
| 25 |
- Core dimensions: `event_name`, `timestamp`, `session_id`
|
|
|
|
| 26 |
- Behavior context: `benchmark`, `filters`
|
| 27 |
- Visitor identity (approximate): `properties.visitor_id`
|
|
|
|
| 28 |
- Change context: `properties.old_value`, `properties.new_value`, `properties.filter_name`
|
| 29 |
|
| 30 |
Important event names:
|
|
@@ -51,7 +53,7 @@ Important event names:
|
|
| 51 |
- **Definition**: Number of unique interaction sessions.
|
| 52 |
- **Source fields**: `session_id`
|
| 53 |
- **Calculation**:
|
| 54 |
-
- Sessions = count of distinct `session_id` in the selected time range
|
| 55 |
|
| 56 |
### 3) UV (Unique Visitors, Approximate)
|
| 57 |
|
|
@@ -59,7 +61,7 @@ Important event names:
|
|
| 59 |
- **Source fields**: `properties.visitor_id`
|
| 60 |
- **Calculation**:
|
| 61 |
- Remove null/empty `properties.visitor_id`
|
| 62 |
-
- UV = count of distinct `properties.visitor_id`
|
| 63 |
|
| 64 |
### 4) Sessions Per Visitor
|
| 65 |
|
|
@@ -106,7 +108,7 @@ Important event names:
|
|
| 106 |
- **Source fields**: `event_name`, `session_id`
|
| 107 |
- **Calculation**:
|
| 108 |
- For each `filter_change_`* event type:
|
| 109 |
-
- collect distinct `session_id`
|
| 110 |
- coverage = distinct session count
|
| 111 |
|
| 112 |
---
|
|
@@ -122,12 +124,12 @@ Recommended session-level funnel:
|
|
| 122 |
|
| 123 |
### 9) Step Session Count
|
| 124 |
|
| 125 |
-
- **Definition**: Number of sessions that reached each funnel step.
|
| 126 |
-
- **Source fields**: `session_id`, `event_name`
|
| 127 |
- **Calculation**:
|
| 128 |
- Group events by `session_id`
|
| 129 |
-
-
|
| 130 |
-
- Count
|
| 131 |
|
| 132 |
### 10) Step Conversion Rate
|
| 133 |
|
|
@@ -144,10 +146,10 @@ Recommended session-level funnel:
|
|
| 144 |
### 11) New Visitors
|
| 145 |
|
| 146 |
- **Definition**: Visitors whose current period contains their first observed visit date.
|
| 147 |
-
- **Source fields**: `event_name`, `timestamp`, `properties.visitor_id`
|
| 148 |
- **Calculation**:
|
| 149 |
- Use `page_view` events only
|
| 150 |
-
- For each `visitor_id`, find earliest timestamp (`first_seen`)
|
| 151 |
- If event date equals `first_seen` date, classify as `new`
|
| 152 |
- Count distinct `visitor_id` by period
|
| 153 |
|
|
@@ -160,6 +162,19 @@ Recommended session-level funnel:
|
|
| 160 |
- If event date is later than first-seen date, classify as `returning`
|
| 161 |
- Count distinct `visitor_id` by period
|
| 162 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
---
|
| 164 |
|
| 165 |
## Time Aggregation Rules
|
|
@@ -170,10 +185,11 @@ All trend metrics support these granularities:
|
|
| 170 |
- `week` -> `%G-W%V` (ISO week)
|
| 171 |
- `month` -> `%Y-%m`
|
| 172 |
|
| 173 |
-
Time filtering
|
| 174 |
|
| 175 |
-
-
|
| 176 |
-
-
|
|
|
|
| 177 |
|
| 178 |
Optional benchmark filtering:
|
| 179 |
|
|
@@ -186,6 +202,27 @@ Optional benchmark filtering:
|
|
| 186 |
1. `visitor_id` is an approximate identifier, not a strict user identity.
|
| 187 |
2. For `filter_change_`*, `properties.new_value` may not always represent the actual final filter value; prefer `filters` snapshot for behavioral context.
|
| 188 |
3. If `table_download` is not instrumented, funnel step 4 will under-report by design.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
|
| 190 |
---
|
| 191 |
|
|
@@ -196,6 +233,19 @@ Only required runtime inputs:
|
|
| 196 |
- MongoDB connection URI (`MONGO_URI`)
|
| 197 |
- Mongo database/collection names (defaults supported)
|
| 198 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
Local commands:
|
| 200 |
|
| 201 |
```bash
|
|
@@ -203,3 +253,10 @@ uv sync
|
|
| 203 |
uv run leaderboard-analytics
|
| 204 |
```
|
| 205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
All analytics are based on the `events` collection and the following stable fields:
|
| 24 |
|
| 25 |
- Core dimensions: `event_name`, `timestamp`, `session_id`
|
| 26 |
+
- Preferred event time: `ts` as a MongoDB Date
|
| 27 |
- Behavior context: `benchmark`, `filters`
|
| 28 |
- Visitor identity (approximate): `properties.visitor_id`
|
| 29 |
+
- Visitor IP for country analysis: `properties.ip`
|
| 30 |
- Change context: `properties.old_value`, `properties.new_value`, `properties.filter_name`
|
| 31 |
|
| 32 |
Important event names:
|
|
|
|
| 53 |
- **Definition**: Number of unique interaction sessions.
|
| 54 |
- **Source fields**: `session_id`
|
| 55 |
- **Calculation**:
|
| 56 |
+
- Sessions = count of distinct non-empty `session_id` values in the selected time range
|
| 57 |
|
| 58 |
### 3) UV (Unique Visitors, Approximate)
|
| 59 |
|
|
|
|
| 61 |
- **Source fields**: `properties.visitor_id`
|
| 62 |
- **Calculation**:
|
| 63 |
- Remove null/empty `properties.visitor_id`
|
| 64 |
+
- UV = count of distinct `properties.visitor_id` values in the selected time range
|
| 65 |
|
| 66 |
### 4) Sessions Per Visitor
|
| 67 |
|
|
|
|
| 108 |
- **Source fields**: `event_name`, `session_id`
|
| 109 |
- **Calculation**:
|
| 110 |
- For each `filter_change_`* event type:
|
| 111 |
+
- collect distinct non-empty `session_id`
|
| 112 |
- coverage = distinct session count
|
| 113 |
|
| 114 |
---
|
|
|
|
| 124 |
|
| 125 |
### 9) Step Session Count
|
| 126 |
|
| 127 |
+
- **Definition**: Number of sessions that reached each ordered funnel step.
|
| 128 |
+
- **Source fields**: `session_id`, `event_name`, `ts` or `timestamp`
|
| 129 |
- **Calculation**:
|
| 130 |
- Group events by `session_id`
|
| 131 |
+
- Sort events by event time
|
| 132 |
+
- Count each cumulative step only when it occurs after the previous required step
|
| 133 |
|
| 134 |
### 10) Step Conversion Rate
|
| 135 |
|
|
|
|
| 146 |
### 11) New Visitors
|
| 147 |
|
| 148 |
- **Definition**: Visitors whose current period contains their first observed visit date.
|
| 149 |
+
- **Source fields**: `event_name`, `ts` or `timestamp`, `properties.visitor_id`
|
| 150 |
- **Calculation**:
|
| 151 |
- Use `page_view` events only
|
| 152 |
+
- For each `visitor_id`, find earliest timestamp (`first_seen`) from the full available dataset
|
| 153 |
- If event date equals `first_seen` date, classify as `new`
|
| 154 |
- Count distinct `visitor_id` by period
|
| 155 |
|
|
|
|
| 162 |
- If event date is later than first-seen date, classify as `returning`
|
| 163 |
- Count distinct `visitor_id` by period
|
| 164 |
|
| 165 |
+
### 13) Visitor Locations by Country
|
| 166 |
+
|
| 167 |
+
- **Definition**: Page view volume by visitor IP country/region.
|
| 168 |
+
- **Source fields**: `event_name`, `properties.ip`
|
| 169 |
+
- **Calculation**:
|
| 170 |
+
- Filter `event_name == "page_view"`
|
| 171 |
+
- Remove null/empty `properties.ip`
|
| 172 |
+
- Group page views by IP in MongoDB
|
| 173 |
+
- Resolve each IP to a country using the local MaxMind GeoLite2 Country database
|
| 174 |
+
- Group by `country_code` and `country_name`
|
| 175 |
+
- Map color = page view count (`pv`)
|
| 176 |
+
- Private, invalid, unresolved, or unconfigured IPs are grouped as `Unknown`
|
| 177 |
+
|
| 178 |
---
|
| 179 |
|
| 180 |
## Time Aggregation Rules
|
|
|
|
| 185 |
- `week` -> `%G-W%V` (ISO week)
|
| 186 |
- `month` -> `%Y-%m`
|
| 187 |
|
| 188 |
+
Time filtering rules:
|
| 189 |
|
| 190 |
+
- Prefer the indexed MongoDB Date field `ts`
|
| 191 |
+
- Fall back to converting legacy `timestamp` values when `ts` is not present
|
| 192 |
+
- Keep records where `start_time <= event time <= end_time`
|
| 193 |
|
| 194 |
Optional benchmark filtering:
|
| 195 |
|
|
|
|
| 202 |
1. `visitor_id` is an approximate identifier, not a strict user identity.
|
| 203 |
2. For `filter_change_`*, `properties.new_value` may not always represent the actual final filter value; prefer `filters` snapshot for behavioral context.
|
| 204 |
3. If `table_download` is not instrumented, funnel step 4 will under-report by design.
|
| 205 |
+
4. Total UV and Sessions are distinct counts across the full selected time range. They are not calculated by summing per-period trend values.
|
| 206 |
+
5. Funnel steps are ordered by event time. A session only reaches a later step when that step happens after the previous required step.
|
| 207 |
+
|
| 208 |
+
---
|
| 209 |
+
|
| 210 |
+
## MongoDB Performance Notes
|
| 211 |
+
|
| 212 |
+
For production deployments, store event time as a MongoDB Date field named `ts`. Keeping only string timestamps forces aggregation pipelines to convert time values at query time and can reduce index usage.
|
| 213 |
+
|
| 214 |
+
Recommended indexes:
|
| 215 |
+
|
| 216 |
+
```javascript
|
| 217 |
+
db.events.createIndex({ ts: 1 })
|
| 218 |
+
db.events.createIndex({ ts: 1, benchmark: 1 })
|
| 219 |
+
db.events.createIndex({ event_name: 1, ts: 1 })
|
| 220 |
+
db.events.createIndex({ session_id: 1, ts: 1 })
|
| 221 |
+
db.events.createIndex({ "properties.visitor_id": 1, ts: 1 })
|
| 222 |
+
db.events.createIndex({ event_name: 1, ts: 1, "properties.ip": 1 })
|
| 223 |
+
```
|
| 224 |
+
|
| 225 |
+
Legacy events with only `timestamp` remain supported, but backfilling `ts` is recommended before running this dashboard against large collections.
|
| 226 |
|
| 227 |
---
|
| 228 |
|
|
|
|
| 233 |
- MongoDB connection URI (`MONGO_URI`)
|
| 234 |
- Mongo database/collection names (defaults supported)
|
| 235 |
|
| 236 |
+
Optional visitor location input:
|
| 237 |
+
|
| 238 |
+
- `GEOIP_DATABASE_PATH`: path to a local MaxMind `GeoLite2-Country.mmdb` file
|
| 239 |
+
- `GEOIP_DATABASE_URL`: URL for a gzipped GeoLite2 Country MMDB download
|
| 240 |
+
- `GEOIP_AUTO_DOWNLOAD`: whether to download and decompress the MMDB when missing
|
| 241 |
+
|
| 242 |
+
The dashboard does not call an external IP lookup API for visitor lookups. By default,
|
| 243 |
+
startup downloads `https://cdn.jsdelivr.net/npm/geolite2-country/GeoLite2-Country.mmdb.gz`
|
| 244 |
+
when `GEOIP_DATABASE_PATH` is missing, decompresses it, and uses the resulting MMDB file
|
| 245 |
+
locally. Set `GEOIP_AUTO_DOWNLOAD=false` if the runtime cannot access the network or if
|
| 246 |
+
you prefer to mount the MMDB yourself. If the database is unavailable, visitor location
|
| 247 |
+
rows are grouped as `Unknown`.
|
| 248 |
+
|
| 249 |
Local commands:
|
| 250 |
|
| 251 |
```bash
|
|
|
|
| 253 |
uv run leaderboard-analytics
|
| 254 |
```
|
| 255 |
|
| 256 |
+
Run quality checks:
|
| 257 |
+
|
| 258 |
+
```bash
|
| 259 |
+
uv run ruff format --check .
|
| 260 |
+
uv run ruff check .
|
| 261 |
+
uv run pytest
|
| 262 |
+
```
|
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
from pathlib import Path
|
| 2 |
import sys
|
|
|
|
| 3 |
|
| 4 |
# Ensure src-layout package is importable in Hugging Face Spaces runtime.
|
| 5 |
ROOT_DIR = Path(__file__).resolve().parent
|
|
@@ -7,8 +7,9 @@ SRC_DIR = ROOT_DIR / "src"
|
|
| 7 |
if str(SRC_DIR) not in sys.path:
|
| 8 |
sys.path.insert(0, str(SRC_DIR))
|
| 9 |
|
| 10 |
-
from leaderboard_analytics.main import
|
| 11 |
|
|
|
|
| 12 |
|
| 13 |
if __name__ == "__main__":
|
| 14 |
-
|
|
|
|
|
|
|
| 1 |
import sys
|
| 2 |
+
from pathlib import Path
|
| 3 |
|
| 4 |
# Ensure src-layout package is importable in Hugging Face Spaces runtime.
|
| 5 |
ROOT_DIR = Path(__file__).resolve().parent
|
|
|
|
| 7 |
if str(SRC_DIR) not in sys.path:
|
| 8 |
sys.path.insert(0, str(SRC_DIR))
|
| 9 |
|
| 10 |
+
from leaderboard_analytics.main import create_demo, launch_demo # noqa: E402
|
| 11 |
|
| 12 |
+
demo = create_demo()
|
| 13 |
|
| 14 |
if __name__ == "__main__":
|
| 15 |
+
launch_demo(demo)
|
pyproject.toml
CHANGED
|
@@ -12,6 +12,13 @@ dependencies = [
|
|
| 12 |
"python-dotenv>=1.0.1",
|
| 13 |
"pandas>=2.2.3",
|
| 14 |
"plotly>=5.24.1",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
]
|
| 16 |
|
| 17 |
[tool.ruff]
|
|
@@ -34,3 +41,6 @@ build-backend = "hatchling.build"
|
|
| 34 |
|
| 35 |
[tool.hatch.build.targets.wheel]
|
| 36 |
packages = ["src/leaderboard_analytics"]
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
"python-dotenv>=1.0.1",
|
| 13 |
"pandas>=2.2.3",
|
| 14 |
"plotly>=5.24.1",
|
| 15 |
+
"geoip2>=4.8.0",
|
| 16 |
+
]
|
| 17 |
+
|
| 18 |
+
[project.optional-dependencies]
|
| 19 |
+
dev = [
|
| 20 |
+
"pytest>=8.3.0",
|
| 21 |
+
"ruff>=0.8.0",
|
| 22 |
]
|
| 23 |
|
| 24 |
[tool.ruff]
|
|
|
|
| 41 |
|
| 42 |
[tool.hatch.build.targets.wheel]
|
| 43 |
packages = ["src/leaderboard_analytics"]
|
| 44 |
+
|
| 45 |
+
[tool.pytest.ini_options]
|
| 46 |
+
pythonpath = ["src"]
|
requirements.txt
CHANGED
|
@@ -5,3 +5,4 @@ pydantic-settings>=2.6.0
|
|
| 5 |
python-dotenv>=1.0.1
|
| 6 |
pandas>=2.2.3
|
| 7 |
plotly>=5.24.1
|
|
|
|
|
|
| 5 |
python-dotenv>=1.0.1
|
| 6 |
pandas>=2.2.3
|
| 7 |
plotly>=5.24.1
|
| 8 |
+
geoip2>=4.8.0
|
src/leaderboard_analytics/__init__.py
CHANGED
|
@@ -1,2 +1 @@
|
|
| 1 |
"""Leaderboard analytics package."""
|
| 2 |
-
|
|
|
|
| 1 |
"""Leaderboard analytics package."""
|
|
|
src/leaderboard_analytics/config.py
CHANGED
|
@@ -12,9 +12,14 @@ class Settings(BaseSettings):
|
|
| 12 |
host: str = "0.0.0.0"
|
| 13 |
port: int = 7860
|
| 14 |
gradio_share: bool = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
@lru_cache(maxsize=1)
|
| 18 |
def get_settings() -> Settings:
|
| 19 |
return Settings()
|
| 20 |
-
|
|
|
|
| 12 |
host: str = "0.0.0.0"
|
| 13 |
port: int = 7860
|
| 14 |
gradio_share: bool = False
|
| 15 |
+
gradio_ssr_mode: bool = False
|
| 16 |
+
geoip_database_path: str = "GeoLite2-Country.mmdb"
|
| 17 |
+
geoip_database_url: str = (
|
| 18 |
+
"https://cdn.jsdelivr.net/npm/geolite2-country/GeoLite2-Country.mmdb.gz"
|
| 19 |
+
)
|
| 20 |
+
geoip_auto_download: bool = True
|
| 21 |
|
| 22 |
|
| 23 |
@lru_cache(maxsize=1)
|
| 24 |
def get_settings() -> Settings:
|
| 25 |
return Settings()
|
|
|
src/leaderboard_analytics/db.py
CHANGED
|
@@ -9,7 +9,9 @@ def get_mongo_client() -> MongoClient:
|
|
| 9 |
settings = get_settings()
|
| 10 |
if not settings.mongo_uri:
|
| 11 |
raise ValueError("MONGO_URI is not configured. Please set MONGO_URI in .env file.")
|
| 12 |
-
|
|
|
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
def get_database(client: MongoClient) -> Database:
|
|
@@ -20,4 +22,3 @@ def get_database(client: MongoClient) -> Database:
|
|
| 20 |
def get_events_collection(db: Database) -> Collection:
|
| 21 |
settings = get_settings()
|
| 22 |
return db[settings.mongo_collection]
|
| 23 |
-
|
|
|
|
| 9 |
settings = get_settings()
|
| 10 |
if not settings.mongo_uri:
|
| 11 |
raise ValueError("MONGO_URI is not configured. Please set MONGO_URI in .env file.")
|
| 12 |
+
client = MongoClient(settings.mongo_uri, serverSelectionTimeoutMS=5000)
|
| 13 |
+
client.admin.command("ping")
|
| 14 |
+
return client
|
| 15 |
|
| 16 |
|
| 17 |
def get_database(client: MongoClient) -> Database:
|
|
|
|
| 22 |
def get_events_collection(db: Database) -> Collection:
|
| 23 |
settings = get_settings()
|
| 24 |
return db[settings.mongo_collection]
|
|
|
src/leaderboard_analytics/geoip_database.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gzip
|
| 2 |
+
import shutil
|
| 3 |
+
import tempfile
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from urllib.request import urlopen
|
| 6 |
+
|
| 7 |
+
DEFAULT_GEOIP_DATABASE_URL = (
|
| 8 |
+
"https://cdn.jsdelivr.net/npm/geolite2-country/GeoLite2-Country.mmdb.gz"
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def ensure_geoip_database(
|
| 13 |
+
database_path: str | Path,
|
| 14 |
+
source_url: str = DEFAULT_GEOIP_DATABASE_URL,
|
| 15 |
+
*,
|
| 16 |
+
auto_download: bool = True,
|
| 17 |
+
timeout: float = 30.0,
|
| 18 |
+
) -> Path:
|
| 19 |
+
target_path = Path(database_path)
|
| 20 |
+
if target_path.exists() or not auto_download:
|
| 21 |
+
return target_path
|
| 22 |
+
|
| 23 |
+
target_path.parent.mkdir(parents=True, exist_ok=True)
|
| 24 |
+
with tempfile.NamedTemporaryFile(
|
| 25 |
+
prefix=f"{target_path.name}.",
|
| 26 |
+
suffix=".tmp",
|
| 27 |
+
dir=target_path.parent,
|
| 28 |
+
delete=False,
|
| 29 |
+
) as temp_file:
|
| 30 |
+
temp_path = Path(temp_file.name)
|
| 31 |
+
with urlopen(source_url, timeout=timeout) as response:
|
| 32 |
+
with gzip.GzipFile(fileobj=response) as gzip_file:
|
| 33 |
+
shutil.copyfileobj(gzip_file, temp_file)
|
| 34 |
+
|
| 35 |
+
temp_path.replace(target_path)
|
| 36 |
+
return target_path
|
src/leaderboard_analytics/main.py
CHANGED
|
@@ -1,22 +1,49 @@
|
|
| 1 |
from leaderboard_analytics.config import get_settings
|
| 2 |
from leaderboard_analytics.db import get_database, get_events_collection, get_mongo_client
|
|
|
|
| 3 |
from leaderboard_analytics.repositories import AnalyticsRepository
|
| 4 |
from leaderboard_analytics.services import AnalyticsService
|
| 5 |
from leaderboard_analytics.ui import build_dashboard
|
| 6 |
|
| 7 |
|
| 8 |
-
def
|
| 9 |
settings = get_settings()
|
| 10 |
client = get_mongo_client()
|
| 11 |
db = get_database(client)
|
| 12 |
events_collection = get_events_collection(db)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
repository = AnalyticsRepository(events_collection=events_collection)
|
| 15 |
-
service = AnalyticsService(
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
if __name__ == "__main__":
|
| 21 |
run()
|
| 22 |
-
|
|
|
|
| 1 |
from leaderboard_analytics.config import get_settings
|
| 2 |
from leaderboard_analytics.db import get_database, get_events_collection, get_mongo_client
|
| 3 |
+
from leaderboard_analytics.geoip_database import ensure_geoip_database
|
| 4 |
from leaderboard_analytics.repositories import AnalyticsRepository
|
| 5 |
from leaderboard_analytics.services import AnalyticsService
|
| 6 |
from leaderboard_analytics.ui import build_dashboard
|
| 7 |
|
| 8 |
|
| 9 |
+
def create_demo():
|
| 10 |
settings = get_settings()
|
| 11 |
client = get_mongo_client()
|
| 12 |
db = get_database(client)
|
| 13 |
events_collection = get_events_collection(db)
|
| 14 |
+
geoip_database_path = settings.geoip_database_path
|
| 15 |
+
try:
|
| 16 |
+
geoip_database_path = str(
|
| 17 |
+
ensure_geoip_database(
|
| 18 |
+
settings.geoip_database_path,
|
| 19 |
+
settings.geoip_database_url,
|
| 20 |
+
auto_download=settings.geoip_auto_download,
|
| 21 |
+
)
|
| 22 |
+
)
|
| 23 |
+
except Exception as exc:
|
| 24 |
+
print(f"GeoIP database download failed: {exc}")
|
| 25 |
|
| 26 |
repository = AnalyticsRepository(events_collection=events_collection)
|
| 27 |
+
service = AnalyticsService(
|
| 28 |
+
repository=repository,
|
| 29 |
+
geoip_database_path=geoip_database_path,
|
| 30 |
+
)
|
| 31 |
+
return build_dashboard(service=service)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def launch_demo(demo) -> None:
|
| 35 |
+
settings = get_settings()
|
| 36 |
+
demo.launch(
|
| 37 |
+
server_name=settings.host,
|
| 38 |
+
server_port=settings.port,
|
| 39 |
+
share=settings.gradio_share,
|
| 40 |
+
ssr_mode=settings.gradio_ssr_mode,
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def run() -> None:
|
| 45 |
+
launch_demo(create_demo())
|
| 46 |
|
| 47 |
|
| 48 |
if __name__ == "__main__":
|
| 49 |
run()
|
|
|
src/leaderboard_analytics/repositories.py
CHANGED
|
@@ -11,12 +11,34 @@ def _period_expression(granularity: Granularity) -> dict:
|
|
| 11 |
Granularity.WEEK: "%G-W%V",
|
| 12 |
Granularity.MONTH: "%Y-%m",
|
| 13 |
}
|
| 14 |
-
return {"$dateToString": {"format": format_map[granularity], "date": "$
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
def _with_time_and_optional_benchmark(filters: QueryFilters) -> dict:
|
| 18 |
matcher: dict = {
|
| 19 |
-
"
|
| 20 |
"$gte": filters.start_time,
|
| 21 |
"$lte": filters.end_time,
|
| 22 |
}
|
|
@@ -26,6 +48,23 @@ def _with_time_and_optional_benchmark(filters: QueryFilters) -> dict:
|
|
| 26 |
return matcher
|
| 27 |
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
class AnalyticsRepository:
|
| 30 |
def __init__(self, events_collection: Collection) -> None:
|
| 31 |
self.events_collection = events_collection
|
|
@@ -33,7 +72,8 @@ class AnalyticsRepository:
|
|
| 33 |
def overview_timeseries(self, filters: QueryFilters) -> list[dict]:
|
| 34 |
period_expr = _period_expression(filters.granularity)
|
| 35 |
pipeline: list[dict] = [
|
| 36 |
-
{"$
|
|
|
|
| 37 |
{"$match": _with_time_and_optional_benchmark(filters)},
|
| 38 |
{
|
| 39 |
"$group": {
|
|
@@ -50,27 +90,52 @@ class AnalyticsRepository:
|
|
| 50 |
"period": "$_id.period",
|
| 51 |
"pv": 1,
|
| 52 |
"event_count": 1,
|
| 53 |
-
"session_count":
|
| 54 |
-
"uv":
|
| 55 |
-
"$size": {
|
| 56 |
-
"$filter": {
|
| 57 |
-
"input": "$visitors",
|
| 58 |
-
"as": "v",
|
| 59 |
-
"cond": {"$and": [{"$ne": ["$$v", None]}, {"$ne": ["$$v", ""]}]},
|
| 60 |
-
}
|
| 61 |
-
}
|
| 62 |
-
},
|
| 63 |
}
|
| 64 |
},
|
| 65 |
{"$sort": {"period": 1}},
|
| 66 |
]
|
| 67 |
return list(self.events_collection.aggregate(pipeline))
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
def benchmark_top(self, filters: QueryFilters, limit: int = 20) -> list[dict]:
|
| 70 |
pipeline: list[dict] = [
|
| 71 |
-
{"$
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
{"$group": {"_id": "$properties.new_value", "count": {"$sum": 1}}},
|
|
|
|
| 74 |
{"$project": {"_id": 0, "benchmark": "$_id", "count": 1}},
|
| 75 |
{"$sort": {"count": -1}},
|
| 76 |
{"$limit": limit},
|
|
@@ -79,20 +144,27 @@ class AnalyticsRepository:
|
|
| 79 |
|
| 80 |
def filter_distribution(self, filters: QueryFilters) -> list[dict]:
|
| 81 |
pipeline: list[dict] = [
|
| 82 |
-
{"$
|
|
|
|
| 83 |
{
|
| 84 |
"$match": {
|
| 85 |
**_with_time_and_optional_benchmark(filters),
|
| 86 |
"event_name": {"$regex": "^filter_change_"},
|
| 87 |
}
|
| 88 |
},
|
| 89 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
{
|
| 91 |
"$project": {
|
| 92 |
"_id": 0,
|
| 93 |
"event_name": "$_id",
|
| 94 |
"count": 1,
|
| 95 |
-
"session_coverage":
|
| 96 |
}
|
| 97 |
},
|
| 98 |
{"$sort": {"count": -1}},
|
|
@@ -101,41 +173,169 @@ class AnalyticsRepository:
|
|
| 101 |
|
| 102 |
def funnel(self, filters: QueryFilters) -> list[dict]:
|
| 103 |
pipeline: list[dict] = [
|
| 104 |
-
{"$
|
|
|
|
| 105 |
{"$match": _with_time_and_optional_benchmark(filters)},
|
| 106 |
-
{"$
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
{
|
| 108 |
"$project": {
|
| 109 |
-
"
|
| 110 |
-
"
|
| 111 |
-
"
|
| 112 |
-
"$
|
| 113 |
{
|
| 114 |
-
"$
|
| 115 |
-
"
|
| 116 |
-
"
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
}
|
| 121 |
},
|
| 122 |
0,
|
| 123 |
]
|
| 124 |
},
|
| 125 |
-
"has_table_download": {"$in": ["table_download", "$events"]},
|
| 126 |
}
|
| 127 |
},
|
| 128 |
{
|
| 129 |
"$group": {
|
| 130 |
"_id": None,
|
| 131 |
-
"step1_page_view": {
|
|
|
|
|
|
|
| 132 |
"step2_benchmark_change": {
|
| 133 |
-
"$sum": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
},
|
| 135 |
"step3_filter_change": {
|
| 136 |
"$sum": {
|
| 137 |
"$cond": [
|
| 138 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
1,
|
| 140 |
0,
|
| 141 |
]
|
|
@@ -146,10 +346,10 @@ class AnalyticsRepository:
|
|
| 146 |
"$cond": [
|
| 147 |
{
|
| 148 |
"$and": [
|
| 149 |
-
"$
|
| 150 |
-
"$
|
| 151 |
-
"$
|
| 152 |
-
"$
|
| 153 |
]
|
| 154 |
},
|
| 155 |
1,
|
|
@@ -174,10 +374,9 @@ class AnalyticsRepository:
|
|
| 174 |
def visitors_new_vs_returning(self, filters: QueryFilters) -> list[dict]:
|
| 175 |
period_expr = _period_expression(filters.granularity)
|
| 176 |
pipeline: list[dict] = [
|
| 177 |
-
|
| 178 |
{
|
| 179 |
"$match": {
|
| 180 |
-
**_with_time_and_optional_benchmark(filters),
|
| 181 |
"event_name": "page_view",
|
| 182 |
"visitor_id": {"$nin": [None, ""]},
|
| 183 |
}
|
|
@@ -185,31 +384,80 @@ class AnalyticsRepository:
|
|
| 185 |
{
|
| 186 |
"$setWindowFields": {
|
| 187 |
"partitionBy": "$visitor_id",
|
| 188 |
-
"sortBy": {"
|
| 189 |
-
"output": {"first_seen": {"$first": "$
|
| 190 |
}
|
| 191 |
},
|
|
|
|
| 192 |
{
|
| 193 |
"$project": {
|
| 194 |
"period": period_expr,
|
| 195 |
-
"is_new": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
"visitor_id": 1,
|
| 197 |
}
|
| 198 |
},
|
| 199 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
{
|
| 201 |
"$project": {
|
| 202 |
"_id": 0,
|
| 203 |
"period": "$_id.period",
|
| 204 |
"is_new": "$_id.is_new",
|
| 205 |
-
"visitor_count":
|
| 206 |
}
|
| 207 |
},
|
| 208 |
{"$sort": {"period": 1, "is_new": -1}},
|
| 209 |
]
|
| 210 |
return list(self.events_collection.aggregate(pipeline))
|
| 211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
@staticmethod
|
| 213 |
def safe_first(items: Iterable[dict]) -> dict:
|
| 214 |
return next(iter(items), {})
|
| 215 |
-
|
|
|
|
| 11 |
Granularity.WEEK: "%G-W%V",
|
| 12 |
Granularity.MONTH: "%Y-%m",
|
| 13 |
}
|
| 14 |
+
return {"$dateToString": {"format": format_map[granularity], "date": "$event_ts"}}
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _with_normalized_time() -> dict:
|
| 18 |
+
return {
|
| 19 |
+
"$addFields": {
|
| 20 |
+
"event_ts": {"$ifNull": ["$ts", {"$toDate": "$timestamp"}]},
|
| 21 |
+
"visitor_id": "$properties.visitor_id",
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _indexed_time_prefilter(filters: QueryFilters) -> dict:
|
| 27 |
+
matcher: dict = {
|
| 28 |
+
"$or": [
|
| 29 |
+
{"ts": {"$gte": filters.start_time, "$lte": filters.end_time}},
|
| 30 |
+
{"ts": None},
|
| 31 |
+
{"ts": {"$exists": False}},
|
| 32 |
+
]
|
| 33 |
+
}
|
| 34 |
+
if filters.benchmark:
|
| 35 |
+
matcher["benchmark"] = filters.benchmark
|
| 36 |
+
return matcher
|
| 37 |
|
| 38 |
|
| 39 |
def _with_time_and_optional_benchmark(filters: QueryFilters) -> dict:
|
| 40 |
matcher: dict = {
|
| 41 |
+
"event_ts": {
|
| 42 |
"$gte": filters.start_time,
|
| 43 |
"$lte": filters.end_time,
|
| 44 |
}
|
|
|
|
| 48 |
return matcher
|
| 49 |
|
| 50 |
|
| 51 |
+
def _non_empty_set_size(field_name: str, variable_name: str) -> dict:
|
| 52 |
+
return {
|
| 53 |
+
"$size": {
|
| 54 |
+
"$filter": {
|
| 55 |
+
"input": f"${field_name}",
|
| 56 |
+
"as": variable_name,
|
| 57 |
+
"cond": {
|
| 58 |
+
"$and": [
|
| 59 |
+
{"$ne": [f"$${variable_name}", None]},
|
| 60 |
+
{"$ne": [f"$${variable_name}", ""]},
|
| 61 |
+
]
|
| 62 |
+
},
|
| 63 |
+
}
|
| 64 |
+
}
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
|
| 68 |
class AnalyticsRepository:
|
| 69 |
def __init__(self, events_collection: Collection) -> None:
|
| 70 |
self.events_collection = events_collection
|
|
|
|
| 72 |
def overview_timeseries(self, filters: QueryFilters) -> list[dict]:
|
| 73 |
period_expr = _period_expression(filters.granularity)
|
| 74 |
pipeline: list[dict] = [
|
| 75 |
+
{"$match": _indexed_time_prefilter(filters)},
|
| 76 |
+
_with_normalized_time(),
|
| 77 |
{"$match": _with_time_and_optional_benchmark(filters)},
|
| 78 |
{
|
| 79 |
"$group": {
|
|
|
|
| 90 |
"period": "$_id.period",
|
| 91 |
"pv": 1,
|
| 92 |
"event_count": 1,
|
| 93 |
+
"session_count": _non_empty_set_size("sessions", "s"),
|
| 94 |
+
"uv": _non_empty_set_size("visitors", "v"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
}
|
| 96 |
},
|
| 97 |
{"$sort": {"period": 1}},
|
| 98 |
]
|
| 99 |
return list(self.events_collection.aggregate(pipeline))
|
| 100 |
|
| 101 |
+
def overview_totals(self, filters: QueryFilters) -> dict:
|
| 102 |
+
pipeline: list[dict] = [
|
| 103 |
+
{"$match": _indexed_time_prefilter(filters)},
|
| 104 |
+
_with_normalized_time(),
|
| 105 |
+
{"$match": _with_time_and_optional_benchmark(filters)},
|
| 106 |
+
{
|
| 107 |
+
"$group": {
|
| 108 |
+
"_id": None,
|
| 109 |
+
"pv": {"$sum": {"$cond": [{"$eq": ["$event_name", "page_view"]}, 1, 0]}},
|
| 110 |
+
"events": {"$sum": 1},
|
| 111 |
+
"sessions": {"$addToSet": "$session_id"},
|
| 112 |
+
"visitors": {"$addToSet": "$visitor_id"},
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
"$project": {
|
| 117 |
+
"_id": 0,
|
| 118 |
+
"pv": 1,
|
| 119 |
+
"events": 1,
|
| 120 |
+
"sessions": _non_empty_set_size("sessions", "s"),
|
| 121 |
+
"uv": _non_empty_set_size("visitors", "v"),
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
]
|
| 125 |
+
return self.safe_first(self.events_collection.aggregate(pipeline))
|
| 126 |
+
|
| 127 |
def benchmark_top(self, filters: QueryFilters, limit: int = 20) -> list[dict]:
|
| 128 |
pipeline: list[dict] = [
|
| 129 |
+
{"$match": _indexed_time_prefilter(filters)},
|
| 130 |
+
_with_normalized_time(),
|
| 131 |
+
{
|
| 132 |
+
"$match": {
|
| 133 |
+
**_with_time_and_optional_benchmark(filters),
|
| 134 |
+
"event_name": "benchmark_change",
|
| 135 |
+
}
|
| 136 |
+
},
|
| 137 |
{"$group": {"_id": "$properties.new_value", "count": {"$sum": 1}}},
|
| 138 |
+
{"$match": {"_id": {"$nin": [None, ""]}}},
|
| 139 |
{"$project": {"_id": 0, "benchmark": "$_id", "count": 1}},
|
| 140 |
{"$sort": {"count": -1}},
|
| 141 |
{"$limit": limit},
|
|
|
|
| 144 |
|
| 145 |
def filter_distribution(self, filters: QueryFilters) -> list[dict]:
|
| 146 |
pipeline: list[dict] = [
|
| 147 |
+
{"$match": _indexed_time_prefilter(filters)},
|
| 148 |
+
_with_normalized_time(),
|
| 149 |
{
|
| 150 |
"$match": {
|
| 151 |
**_with_time_and_optional_benchmark(filters),
|
| 152 |
"event_name": {"$regex": "^filter_change_"},
|
| 153 |
}
|
| 154 |
},
|
| 155 |
+
{
|
| 156 |
+
"$group": {
|
| 157 |
+
"_id": "$event_name",
|
| 158 |
+
"count": {"$sum": 1},
|
| 159 |
+
"sessions": {"$addToSet": "$session_id"},
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
{
|
| 163 |
"$project": {
|
| 164 |
"_id": 0,
|
| 165 |
"event_name": "$_id",
|
| 166 |
"count": 1,
|
| 167 |
+
"session_coverage": _non_empty_set_size("sessions", "s"),
|
| 168 |
}
|
| 169 |
},
|
| 170 |
{"$sort": {"count": -1}},
|
|
|
|
| 173 |
|
| 174 |
def funnel(self, filters: QueryFilters) -> list[dict]:
|
| 175 |
pipeline: list[dict] = [
|
| 176 |
+
{"$match": _indexed_time_prefilter(filters)},
|
| 177 |
+
_with_normalized_time(),
|
| 178 |
{"$match": _with_time_and_optional_benchmark(filters)},
|
| 179 |
+
{"$sort": {"session_id": 1, "event_ts": 1}},
|
| 180 |
+
{
|
| 181 |
+
"$group": {
|
| 182 |
+
"_id": "$session_id",
|
| 183 |
+
"events": {"$push": {"name": "$event_name", "ts": "$event_ts"}},
|
| 184 |
+
}
|
| 185 |
+
},
|
| 186 |
+
{"$match": {"_id": {"$nin": [None, ""]}}},
|
| 187 |
+
{
|
| 188 |
+
"$project": {
|
| 189 |
+
"events": 1,
|
| 190 |
+
"page_view_at": {
|
| 191 |
+
"$arrayElemAt": [
|
| 192 |
+
{
|
| 193 |
+
"$map": {
|
| 194 |
+
"input": {
|
| 195 |
+
"$filter": {
|
| 196 |
+
"input": "$events",
|
| 197 |
+
"as": "event",
|
| 198 |
+
"cond": {"$eq": ["$$event.name", "page_view"]},
|
| 199 |
+
}
|
| 200 |
+
},
|
| 201 |
+
"as": "event",
|
| 202 |
+
"in": "$$event.ts",
|
| 203 |
+
}
|
| 204 |
+
},
|
| 205 |
+
0,
|
| 206 |
+
]
|
| 207 |
+
},
|
| 208 |
+
}
|
| 209 |
+
},
|
| 210 |
{
|
| 211 |
"$project": {
|
| 212 |
+
"events": 1,
|
| 213 |
+
"page_view_at": 1,
|
| 214 |
+
"benchmark_change_at": {
|
| 215 |
+
"$arrayElemAt": [
|
| 216 |
{
|
| 217 |
+
"$map": {
|
| 218 |
+
"input": {
|
| 219 |
+
"$filter": {
|
| 220 |
+
"input": "$events",
|
| 221 |
+
"as": "event",
|
| 222 |
+
"cond": {
|
| 223 |
+
"$and": [
|
| 224 |
+
{"$eq": ["$$event.name", "benchmark_change"]},
|
| 225 |
+
{"$gte": ["$$event.ts", "$page_view_at"]},
|
| 226 |
+
]
|
| 227 |
+
},
|
| 228 |
+
}
|
| 229 |
+
},
|
| 230 |
+
"as": "event",
|
| 231 |
+
"in": "$$event.ts",
|
| 232 |
+
}
|
| 233 |
+
},
|
| 234 |
+
0,
|
| 235 |
+
]
|
| 236 |
+
},
|
| 237 |
+
}
|
| 238 |
+
},
|
| 239 |
+
{
|
| 240 |
+
"$project": {
|
| 241 |
+
"events": 1,
|
| 242 |
+
"page_view_at": 1,
|
| 243 |
+
"benchmark_change_at": 1,
|
| 244 |
+
"filter_change_at": {
|
| 245 |
+
"$arrayElemAt": [
|
| 246 |
+
{
|
| 247 |
+
"$map": {
|
| 248 |
+
"input": {
|
| 249 |
+
"$filter": {
|
| 250 |
+
"input": "$events",
|
| 251 |
+
"as": "event",
|
| 252 |
+
"cond": {
|
| 253 |
+
"$and": [
|
| 254 |
+
{
|
| 255 |
+
"$regexMatch": {
|
| 256 |
+
"input": "$$event.name",
|
| 257 |
+
"regex": "^filter_change_",
|
| 258 |
+
}
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"$gte": [
|
| 262 |
+
"$$event.ts",
|
| 263 |
+
"$benchmark_change_at",
|
| 264 |
+
]
|
| 265 |
+
},
|
| 266 |
+
]
|
| 267 |
+
},
|
| 268 |
+
}
|
| 269 |
+
},
|
| 270 |
+
"as": "event",
|
| 271 |
+
"in": "$$event.ts",
|
| 272 |
+
}
|
| 273 |
+
},
|
| 274 |
+
0,
|
| 275 |
+
]
|
| 276 |
+
},
|
| 277 |
+
}
|
| 278 |
+
},
|
| 279 |
+
{
|
| 280 |
+
"$project": {
|
| 281 |
+
"page_view_at": 1,
|
| 282 |
+
"benchmark_change_at": 1,
|
| 283 |
+
"filter_change_at": 1,
|
| 284 |
+
"table_download_at": {
|
| 285 |
+
"$arrayElemAt": [
|
| 286 |
+
{
|
| 287 |
+
"$map": {
|
| 288 |
+
"input": {
|
| 289 |
+
"$filter": {
|
| 290 |
+
"input": "$events",
|
| 291 |
+
"as": "event",
|
| 292 |
+
"cond": {
|
| 293 |
+
"$and": [
|
| 294 |
+
{"$eq": ["$$event.name", "table_download"]},
|
| 295 |
+
{"$gte": ["$$event.ts", "$filter_change_at"]},
|
| 296 |
+
]
|
| 297 |
+
},
|
| 298 |
+
}
|
| 299 |
+
},
|
| 300 |
+
"as": "event",
|
| 301 |
+
"in": "$$event.ts",
|
| 302 |
}
|
| 303 |
},
|
| 304 |
0,
|
| 305 |
]
|
| 306 |
},
|
|
|
|
| 307 |
}
|
| 308 |
},
|
| 309 |
{
|
| 310 |
"$group": {
|
| 311 |
"_id": None,
|
| 312 |
+
"step1_page_view": {
|
| 313 |
+
"$sum": {"$cond": [{"$ne": ["$page_view_at", None]}, 1, 0]}
|
| 314 |
+
},
|
| 315 |
"step2_benchmark_change": {
|
| 316 |
+
"$sum": {
|
| 317 |
+
"$cond": [
|
| 318 |
+
{
|
| 319 |
+
"$and": [
|
| 320 |
+
{"$ne": ["$page_view_at", None]},
|
| 321 |
+
{"$gte": ["$benchmark_change_at", "$page_view_at"]},
|
| 322 |
+
]
|
| 323 |
+
},
|
| 324 |
+
1,
|
| 325 |
+
0,
|
| 326 |
+
]
|
| 327 |
+
}
|
| 328 |
},
|
| 329 |
"step3_filter_change": {
|
| 330 |
"$sum": {
|
| 331 |
"$cond": [
|
| 332 |
+
{
|
| 333 |
+
"$and": [
|
| 334 |
+
{"$ne": ["$page_view_at", None]},
|
| 335 |
+
{"$gte": ["$benchmark_change_at", "$page_view_at"]},
|
| 336 |
+
{"$gte": ["$filter_change_at", "$benchmark_change_at"]},
|
| 337 |
+
]
|
| 338 |
+
},
|
| 339 |
1,
|
| 340 |
0,
|
| 341 |
]
|
|
|
|
| 346 |
"$cond": [
|
| 347 |
{
|
| 348 |
"$and": [
|
| 349 |
+
{"$ne": ["$page_view_at", None]},
|
| 350 |
+
{"$gte": ["$benchmark_change_at", "$page_view_at"]},
|
| 351 |
+
{"$gte": ["$filter_change_at", "$benchmark_change_at"]},
|
| 352 |
+
{"$gte": ["$table_download_at", "$filter_change_at"]},
|
| 353 |
]
|
| 354 |
},
|
| 355 |
1,
|
|
|
|
| 374 |
def visitors_new_vs_returning(self, filters: QueryFilters) -> list[dict]:
|
| 375 |
period_expr = _period_expression(filters.granularity)
|
| 376 |
pipeline: list[dict] = [
|
| 377 |
+
_with_normalized_time(),
|
| 378 |
{
|
| 379 |
"$match": {
|
|
|
|
| 380 |
"event_name": "page_view",
|
| 381 |
"visitor_id": {"$nin": [None, ""]},
|
| 382 |
}
|
|
|
|
| 384 |
{
|
| 385 |
"$setWindowFields": {
|
| 386 |
"partitionBy": "$visitor_id",
|
| 387 |
+
"sortBy": {"event_ts": 1},
|
| 388 |
+
"output": {"first_seen": {"$first": "$event_ts"}},
|
| 389 |
}
|
| 390 |
},
|
| 391 |
+
{"$match": _with_time_and_optional_benchmark(filters)},
|
| 392 |
{
|
| 393 |
"$project": {
|
| 394 |
"period": period_expr,
|
| 395 |
+
"is_new": {
|
| 396 |
+
"$eq": [
|
| 397 |
+
{"$dateToString": {"format": "%Y-%m-%d", "date": "$event_ts"}},
|
| 398 |
+
{"$dateToString": {"format": "%Y-%m-%d", "date": "$first_seen"}},
|
| 399 |
+
]
|
| 400 |
+
},
|
| 401 |
"visitor_id": 1,
|
| 402 |
}
|
| 403 |
},
|
| 404 |
+
{
|
| 405 |
+
"$group": {
|
| 406 |
+
"_id": {"period": "$period", "is_new": "$is_new"},
|
| 407 |
+
"visitors": {"$addToSet": "$visitor_id"},
|
| 408 |
+
}
|
| 409 |
+
},
|
| 410 |
{
|
| 411 |
"$project": {
|
| 412 |
"_id": 0,
|
| 413 |
"period": "$_id.period",
|
| 414 |
"is_new": "$_id.is_new",
|
| 415 |
+
"visitor_count": _non_empty_set_size("visitors", "v"),
|
| 416 |
}
|
| 417 |
},
|
| 418 |
{"$sort": {"period": 1, "is_new": -1}},
|
| 419 |
]
|
| 420 |
return list(self.events_collection.aggregate(pipeline))
|
| 421 |
|
| 422 |
+
def visitor_ip_counts(self, filters: QueryFilters) -> list[dict]:
|
| 423 |
+
pipeline: list[dict] = [
|
| 424 |
+
{"$match": _indexed_time_prefilter(filters)},
|
| 425 |
+
_with_normalized_time(),
|
| 426 |
+
{
|
| 427 |
+
"$match": {
|
| 428 |
+
**_with_time_and_optional_benchmark(filters),
|
| 429 |
+
"event_name": "page_view",
|
| 430 |
+
"properties.ip": {"$nin": [None, ""]},
|
| 431 |
+
}
|
| 432 |
+
},
|
| 433 |
+
{"$group": {"_id": "$properties.ip", "pv": {"$sum": 1}}},
|
| 434 |
+
{"$project": {"_id": 0, "ip": "$_id", "pv": 1}},
|
| 435 |
+
{"$sort": {"pv": -1}},
|
| 436 |
+
]
|
| 437 |
+
return list(self.events_collection.aggregate(pipeline))
|
| 438 |
+
|
| 439 |
+
def available_benchmarks(
|
| 440 |
+
self, filters: QueryFilters | None = None, limit: int = 100
|
| 441 |
+
) -> list[str]:
|
| 442 |
+
pipeline: list[dict] = []
|
| 443 |
+
if filters is not None:
|
| 444 |
+
pipeline.extend(
|
| 445 |
+
[
|
| 446 |
+
{"$match": _indexed_time_prefilter(filters)},
|
| 447 |
+
_with_normalized_time(),
|
| 448 |
+
{"$match": _with_time_and_optional_benchmark(filters)},
|
| 449 |
+
]
|
| 450 |
+
)
|
| 451 |
+
pipeline.extend(
|
| 452 |
+
[
|
| 453 |
+
{"$match": {"benchmark": {"$nin": [None, ""]}}},
|
| 454 |
+
{"$group": {"_id": "$benchmark"}},
|
| 455 |
+
{"$sort": {"_id": 1}},
|
| 456 |
+
{"$limit": limit},
|
| 457 |
+
]
|
| 458 |
+
)
|
| 459 |
+
return [row["_id"] for row in self.events_collection.aggregate(pipeline)]
|
| 460 |
+
|
| 461 |
@staticmethod
|
| 462 |
def safe_first(items: Iterable[dict]) -> dict:
|
| 463 |
return next(iter(items), {})
|
|
|
src/leaderboard_analytics/schemas.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
-
from datetime import
|
| 2 |
from enum import StrEnum
|
| 3 |
|
| 4 |
-
from pydantic import BaseModel, Field
|
| 5 |
|
| 6 |
|
| 7 |
class Granularity(StrEnum):
|
|
@@ -12,9 +12,16 @@ class Granularity(StrEnum):
|
|
| 12 |
|
| 13 |
class QueryFilters(BaseModel):
|
| 14 |
start_time: datetime = Field(
|
| 15 |
-
default_factory=lambda: datetime.now(tz=
|
|
|
|
|
|
|
| 16 |
)
|
| 17 |
-
end_time: datetime = Field(default_factory=lambda: datetime.now(tz=
|
| 18 |
benchmark: str | None = None
|
| 19 |
granularity: Granularity = Granularity.DAY
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import UTC, datetime
|
| 2 |
from enum import StrEnum
|
| 3 |
|
| 4 |
+
from pydantic import BaseModel, Field, model_validator
|
| 5 |
|
| 6 |
|
| 7 |
class Granularity(StrEnum):
|
|
|
|
| 12 |
|
| 13 |
class QueryFilters(BaseModel):
|
| 14 |
start_time: datetime = Field(
|
| 15 |
+
default_factory=lambda: datetime.now(tz=UTC).replace(
|
| 16 |
+
hour=0, minute=0, second=0, microsecond=0
|
| 17 |
+
)
|
| 18 |
)
|
| 19 |
+
end_time: datetime = Field(default_factory=lambda: datetime.now(tz=UTC))
|
| 20 |
benchmark: str | None = None
|
| 21 |
granularity: Granularity = Granularity.DAY
|
| 22 |
|
| 23 |
+
@model_validator(mode="after")
|
| 24 |
+
def validate_time_range(self) -> "QueryFilters":
|
| 25 |
+
if self.start_time > self.end_time:
|
| 26 |
+
raise ValueError("start_time must be earlier than or equal to end_time")
|
| 27 |
+
return self
|
src/leaderboard_analytics/services.py
CHANGED
|
@@ -1,35 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
|
| 3 |
from leaderboard_analytics.repositories import AnalyticsRepository
|
| 4 |
from leaderboard_analytics.schemas import QueryFilters
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
class AnalyticsService:
|
| 8 |
-
def __init__(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
self.repository = repository
|
|
|
|
| 10 |
|
| 11 |
def get_overview(self, filters: QueryFilters) -> tuple[pd.DataFrame, dict]:
|
| 12 |
rows = self.repository.overview_timeseries(filters)
|
| 13 |
frame = pd.DataFrame(rows)
|
| 14 |
-
|
| 15 |
-
empty = {
|
| 16 |
-
"pv": 0,
|
| 17 |
-
"uv": 0,
|
| 18 |
-
"sessions": 0,
|
| 19 |
-
"events": 0,
|
| 20 |
-
"events_per_session": 0.0,
|
| 21 |
-
"sessions_per_visitor": 0.0,
|
| 22 |
-
}
|
| 23 |
-
return frame, empty
|
| 24 |
-
|
| 25 |
totals = {
|
| 26 |
-
"pv": int(
|
| 27 |
-
"uv": int(
|
| 28 |
-
"sessions": int(
|
| 29 |
-
"events": int(
|
| 30 |
}
|
| 31 |
-
totals["events_per_session"] =
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
return frame, totals
|
| 34 |
|
| 35 |
def get_benchmark_top(self, filters: QueryFilters) -> pd.DataFrame:
|
|
@@ -60,3 +204,61 @@ class AnalyticsService:
|
|
| 60 |
frame["visitor_type"] = frame["is_new"].map({True: "new", False: "returning"})
|
| 61 |
return frame
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import ipaddress
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Any, Protocol
|
| 4 |
+
|
| 5 |
import pandas as pd
|
| 6 |
|
| 7 |
from leaderboard_analytics.repositories import AnalyticsRepository
|
| 8 |
from leaderboard_analytics.schemas import QueryFilters
|
| 9 |
|
| 10 |
+
UNKNOWN_COUNTRY_CODE = "Unknown"
|
| 11 |
+
UNKNOWN_COUNTRY_NAME = "Unknown"
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _empty_ip_debug() -> dict[str, object]:
|
| 15 |
+
return {
|
| 16 |
+
"total_unique_ips": 0,
|
| 17 |
+
"total_ip_pv": 0,
|
| 18 |
+
"global_ips": 0,
|
| 19 |
+
"global_ip_pv": 0,
|
| 20 |
+
"private_ips": 0,
|
| 21 |
+
"private_ip_pv": 0,
|
| 22 |
+
"loopback_ips": 0,
|
| 23 |
+
"loopback_ip_pv": 0,
|
| 24 |
+
"reserved_ips": 0,
|
| 25 |
+
"reserved_ip_pv": 0,
|
| 26 |
+
"link_local_ips": 0,
|
| 27 |
+
"link_local_ip_pv": 0,
|
| 28 |
+
"multicast_ips": 0,
|
| 29 |
+
"multicast_ip_pv": 0,
|
| 30 |
+
"unspecified_ips": 0,
|
| 31 |
+
"unspecified_ip_pv": 0,
|
| 32 |
+
"invalid_ips": 0,
|
| 33 |
+
"invalid_ip_pv": 0,
|
| 34 |
+
"top_ip_pv_buckets": {
|
| 35 |
+
"1": 0,
|
| 36 |
+
"2-10": 0,
|
| 37 |
+
"11-100": 0,
|
| 38 |
+
"101-1000": 0,
|
| 39 |
+
">1000": 0,
|
| 40 |
+
},
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _ip_debug_category(ip_address: str) -> str:
|
| 45 |
+
try:
|
| 46 |
+
parsed_ip = ipaddress.ip_address(ip_address.strip())
|
| 47 |
+
except ValueError:
|
| 48 |
+
return "invalid"
|
| 49 |
+
|
| 50 |
+
if parsed_ip.is_global:
|
| 51 |
+
return "global"
|
| 52 |
+
if parsed_ip.is_loopback:
|
| 53 |
+
return "loopback"
|
| 54 |
+
if parsed_ip.is_private:
|
| 55 |
+
return "private"
|
| 56 |
+
if parsed_ip.is_reserved:
|
| 57 |
+
return "reserved"
|
| 58 |
+
if parsed_ip.is_link_local:
|
| 59 |
+
return "link_local"
|
| 60 |
+
if parsed_ip.is_multicast:
|
| 61 |
+
return "multicast"
|
| 62 |
+
if parsed_ip.is_unspecified:
|
| 63 |
+
return "unspecified"
|
| 64 |
+
return "reserved"
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _ip_pv_bucket(pv: int) -> str:
|
| 68 |
+
if pv <= 1:
|
| 69 |
+
return "1"
|
| 70 |
+
if pv <= 10:
|
| 71 |
+
return "2-10"
|
| 72 |
+
if pv <= 100:
|
| 73 |
+
return "11-100"
|
| 74 |
+
if pv <= 1000:
|
| 75 |
+
return "101-1000"
|
| 76 |
+
return ">1000"
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
class GeoIpCountryReader(Protocol):
|
| 80 |
+
def country(self, ip_address: str) -> Any: ...
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class GeoIpResolver:
|
| 84 |
+
def __init__(
|
| 85 |
+
self,
|
| 86 |
+
database_path: str | Path | None = None,
|
| 87 |
+
reader: GeoIpCountryReader | None = None,
|
| 88 |
+
) -> None:
|
| 89 |
+
self.database_path = Path(database_path) if database_path else None
|
| 90 |
+
self._reader = reader
|
| 91 |
+
self._load_attempted = reader is not None
|
| 92 |
+
|
| 93 |
+
def resolve_country(self, ip_address: str) -> tuple[str, str]:
|
| 94 |
+
try:
|
| 95 |
+
parsed_ip = ipaddress.ip_address(ip_address.strip())
|
| 96 |
+
except ValueError:
|
| 97 |
+
return UNKNOWN_COUNTRY_CODE, UNKNOWN_COUNTRY_NAME
|
| 98 |
+
|
| 99 |
+
if not parsed_ip.is_global:
|
| 100 |
+
return UNKNOWN_COUNTRY_CODE, UNKNOWN_COUNTRY_NAME
|
| 101 |
+
|
| 102 |
+
reader = self._get_reader()
|
| 103 |
+
if reader is None:
|
| 104 |
+
return UNKNOWN_COUNTRY_CODE, UNKNOWN_COUNTRY_NAME
|
| 105 |
+
|
| 106 |
+
try:
|
| 107 |
+
response = reader.country(str(parsed_ip))
|
| 108 |
+
except Exception:
|
| 109 |
+
return UNKNOWN_COUNTRY_CODE, UNKNOWN_COUNTRY_NAME
|
| 110 |
+
|
| 111 |
+
country = response.country
|
| 112 |
+
if not getattr(country, "iso_code", None):
|
| 113 |
+
country = response.registered_country
|
| 114 |
+
|
| 115 |
+
code = getattr(country, "iso_code", None)
|
| 116 |
+
if not code:
|
| 117 |
+
return UNKNOWN_COUNTRY_CODE, UNKNOWN_COUNTRY_NAME
|
| 118 |
+
|
| 119 |
+
return code, getattr(country, "name", None) or code
|
| 120 |
+
|
| 121 |
+
def debug_status(self) -> dict[str, object]:
|
| 122 |
+
return {
|
| 123 |
+
"database_path": str(self.database_path) if self.database_path else "",
|
| 124 |
+
"database_configured": self.database_path is not None,
|
| 125 |
+
"database_exists": self.database_path.exists() if self.database_path else False,
|
| 126 |
+
"load_attempted": self._load_attempted,
|
| 127 |
+
"reader_loaded": self._reader is not None,
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
def _get_reader(self) -> GeoIpCountryReader | None:
|
| 131 |
+
if self._reader is not None:
|
| 132 |
+
return self._reader
|
| 133 |
+
|
| 134 |
+
if self._load_attempted:
|
| 135 |
+
return None
|
| 136 |
+
|
| 137 |
+
self._load_attempted = True
|
| 138 |
+
if self.database_path is None or not self.database_path.exists():
|
| 139 |
+
return None
|
| 140 |
+
|
| 141 |
+
try:
|
| 142 |
+
import geoip2.database
|
| 143 |
+
|
| 144 |
+
self._reader = geoip2.database.Reader(str(self.database_path))
|
| 145 |
+
except Exception:
|
| 146 |
+
return None
|
| 147 |
+
|
| 148 |
+
return self._reader
|
| 149 |
+
|
| 150 |
|
| 151 |
class AnalyticsService:
|
| 152 |
+
def __init__(
|
| 153 |
+
self,
|
| 154 |
+
repository: AnalyticsRepository,
|
| 155 |
+
geoip_database_path: str | Path | None = None,
|
| 156 |
+
geoip_resolver: GeoIpResolver | None = None,
|
| 157 |
+
) -> None:
|
| 158 |
self.repository = repository
|
| 159 |
+
self.geoip_resolver = geoip_resolver or GeoIpResolver(geoip_database_path)
|
| 160 |
|
| 161 |
def get_overview(self, filters: QueryFilters) -> tuple[pd.DataFrame, dict]:
|
| 162 |
rows = self.repository.overview_timeseries(filters)
|
| 163 |
frame = pd.DataFrame(rows)
|
| 164 |
+
raw_totals = self.repository.overview_totals(filters)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
totals = {
|
| 166 |
+
"pv": int(raw_totals.get("pv", 0)),
|
| 167 |
+
"uv": int(raw_totals.get("uv", 0)),
|
| 168 |
+
"sessions": int(raw_totals.get("sessions", 0)),
|
| 169 |
+
"events": int(raw_totals.get("events", 0)),
|
| 170 |
}
|
| 171 |
+
totals["events_per_session"] = (
|
| 172 |
+
round(totals["events"] / totals["sessions"], 2) if totals["sessions"] else 0.0
|
| 173 |
+
)
|
| 174 |
+
totals["sessions_per_visitor"] = (
|
| 175 |
+
round(totals["sessions"] / totals["uv"], 2) if totals["uv"] else 0.0
|
| 176 |
+
)
|
| 177 |
return frame, totals
|
| 178 |
|
| 179 |
def get_benchmark_top(self, filters: QueryFilters) -> pd.DataFrame:
|
|
|
|
| 204 |
frame["visitor_type"] = frame["is_new"].map({True: "new", False: "returning"})
|
| 205 |
return frame
|
| 206 |
|
| 207 |
+
def get_visitor_locations(self, filters: QueryFilters) -> pd.DataFrame:
|
| 208 |
+
frame, _debug = self.get_visitor_location_details(filters)
|
| 209 |
+
return frame
|
| 210 |
+
|
| 211 |
+
def get_visitor_location_details(self, filters: QueryFilters) -> tuple[pd.DataFrame, dict]:
|
| 212 |
+
locations: dict[tuple[str, str], dict[str, int | str]] = {}
|
| 213 |
+
ip_debug = _empty_ip_debug()
|
| 214 |
+
for row in self.repository.visitor_ip_counts(filters):
|
| 215 |
+
ip = str(row.get("ip", "")).strip()
|
| 216 |
+
if not ip:
|
| 217 |
+
continue
|
| 218 |
+
|
| 219 |
+
pv = int(row.get("pv", 0))
|
| 220 |
+
category = _ip_debug_category(ip)
|
| 221 |
+
ip_debug["total_unique_ips"] = int(ip_debug["total_unique_ips"]) + 1
|
| 222 |
+
ip_debug["total_ip_pv"] = int(ip_debug["total_ip_pv"]) + pv
|
| 223 |
+
ip_debug[f"{category}_ips"] = int(ip_debug[f"{category}_ips"]) + 1
|
| 224 |
+
ip_debug[f"{category}_ip_pv"] = int(ip_debug[f"{category}_ip_pv"]) + pv
|
| 225 |
+
ip_debug["top_ip_pv_buckets"][_ip_pv_bucket(pv)] += 1 # type: ignore[index]
|
| 226 |
+
|
| 227 |
+
code, name = self.geoip_resolver.resolve_country(ip)
|
| 228 |
+
key = (code, name)
|
| 229 |
+
if key not in locations:
|
| 230 |
+
locations[key] = {
|
| 231 |
+
"country_code": code,
|
| 232 |
+
"country_name": name,
|
| 233 |
+
"pv": 0,
|
| 234 |
+
"ip_count": 0,
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
locations[key]["pv"] = int(locations[key]["pv"]) + pv
|
| 238 |
+
locations[key]["ip_count"] = int(locations[key]["ip_count"]) + 1
|
| 239 |
+
|
| 240 |
+
frame = pd.DataFrame(
|
| 241 |
+
locations.values(),
|
| 242 |
+
columns=["country_code", "country_name", "pv", "ip_count"],
|
| 243 |
+
)
|
| 244 |
+
if frame.empty:
|
| 245 |
+
return frame, ip_debug
|
| 246 |
+
frame = frame.sort_values(["pv", "ip_count"], ascending=[False, False]).reset_index(
|
| 247 |
+
drop=True
|
| 248 |
+
)
|
| 249 |
+
return frame, ip_debug
|
| 250 |
+
|
| 251 |
+
def get_geoip_debug_info(self) -> dict[str, object]:
|
| 252 |
+
debug_status = getattr(self.geoip_resolver, "debug_status", None)
|
| 253 |
+
if debug_status is None:
|
| 254 |
+
return {
|
| 255 |
+
"database_path": "",
|
| 256 |
+
"database_configured": False,
|
| 257 |
+
"database_exists": False,
|
| 258 |
+
"load_attempted": False,
|
| 259 |
+
"reader_loaded": False,
|
| 260 |
+
}
|
| 261 |
+
return debug_status()
|
| 262 |
+
|
| 263 |
+
def get_available_benchmarks(self, filters: QueryFilters | None = None) -> list[str]:
|
| 264 |
+
return self.repository.available_benchmarks(filters)
|
src/leaderboard_analytics/ui.py
CHANGED
|
@@ -1,9 +1,14 @@
|
|
| 1 |
-
from datetime import datetime, timedelta, timezone
|
| 2 |
import math
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from typing import Any
|
| 4 |
|
| 5 |
import gradio as gr
|
|
|
|
| 6 |
import plotly.express as px
|
|
|
|
| 7 |
|
| 8 |
from leaderboard_analytics.schemas import Granularity, QueryFilters
|
| 9 |
from leaderboard_analytics.services import AnalyticsService
|
|
@@ -19,7 +24,7 @@ def _to_utc_datetime(value: Any, fallback: datetime) -> datetime:
|
|
| 19 |
if isinstance(value, float) and math.isnan(value):
|
| 20 |
return fallback
|
| 21 |
# Gradio DateTime may return Unix timestamps as numbers.
|
| 22 |
-
dt = datetime.fromtimestamp(value, tz=
|
| 23 |
elif isinstance(value, str):
|
| 24 |
dt = datetime.fromisoformat(value)
|
| 25 |
else:
|
|
@@ -27,60 +32,353 @@ def _to_utc_datetime(value: Any, fallback: datetime) -> datetime:
|
|
| 27 |
|
| 28 |
# Gradio DateTime may return naive datetime values in local time.
|
| 29 |
if dt.tzinfo is None:
|
| 30 |
-
dt = dt.replace(tzinfo=
|
| 31 |
-
return dt.astimezone(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
|
| 34 |
def build_dashboard(service: AnalyticsService) -> gr.Blocks:
|
| 35 |
-
default_end = datetime.now(tz=
|
| 36 |
default_start = (default_end - timedelta(days=7)).replace(microsecond=0)
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
def query(
|
| 39 |
start_time: datetime | str | None,
|
| 40 |
end_time: datetime | str | None,
|
| 41 |
benchmark: str,
|
| 42 |
granularity: str,
|
| 43 |
-
) -> tuple[
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
visitor_plot = (
|
| 78 |
-
px.bar(visitors_df, x="period", y="visitor_count", color="visitor_type", barmode="group", title="New vs returning visitors")
|
| 79 |
-
if not visitors_df.empty
|
| 80 |
-
else px.bar(title="New vs returning visitors (no data)")
|
| 81 |
-
)
|
| 82 |
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
with gr.Blocks() as demo:
|
| 86 |
gr.Markdown("# Leaderboard Analytics Dashboard")
|
|
@@ -100,7 +398,12 @@ def build_dashboard(service: AnalyticsService) -> gr.Blocks:
|
|
| 100 |
value=default_end,
|
| 101 |
timezone="UTC",
|
| 102 |
)
|
| 103 |
-
benchmark = gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
granularity = gr.Dropdown(
|
| 105 |
label="Granularity",
|
| 106 |
choices=[Granularity.DAY.value, Granularity.WEEK.value, Granularity.MONTH.value],
|
|
@@ -108,7 +411,9 @@ def build_dashboard(service: AnalyticsService) -> gr.Blocks:
|
|
| 108 |
)
|
| 109 |
refresh = gr.Button("Refresh", variant="primary")
|
| 110 |
|
| 111 |
-
metrics_text = gr.Markdown(
|
|
|
|
|
|
|
| 112 |
|
| 113 |
with gr.Row():
|
| 114 |
overview_plot = gr.Plot(label="Traffic Overview")
|
|
@@ -117,32 +422,60 @@ def build_dashboard(service: AnalyticsService) -> gr.Blocks:
|
|
| 117 |
filter_plot = gr.Plot(label="Filter Behavior")
|
| 118 |
funnel_plot = gr.Plot(label="Funnel")
|
| 119 |
visitor_plot = gr.Plot(label="Visitor Segmentation")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
refresh.click(
|
| 122 |
fn=query,
|
| 123 |
inputs=[start_time, end_time, benchmark, granularity],
|
| 124 |
-
outputs=
|
| 125 |
-
metrics_text,
|
| 126 |
-
overview_plot,
|
| 127 |
-
benchmark_plot,
|
| 128 |
-
filter_plot,
|
| 129 |
-
funnel_plot,
|
| 130 |
-
visitor_plot,
|
| 131 |
-
],
|
| 132 |
)
|
| 133 |
|
|
|
|
| 134 |
demo.load(
|
| 135 |
fn=query,
|
| 136 |
inputs=[start_time, end_time, benchmark, granularity],
|
| 137 |
-
outputs=
|
| 138 |
-
metrics_text,
|
| 139 |
-
overview_plot,
|
| 140 |
-
benchmark_plot,
|
| 141 |
-
filter_plot,
|
| 142 |
-
funnel_plot,
|
| 143 |
-
visitor_plot,
|
| 144 |
-
],
|
| 145 |
)
|
| 146 |
|
|
|
|
| 147 |
return demo
|
| 148 |
-
|
|
|
|
|
|
|
| 1 |
import math
|
| 2 |
+
import tempfile
|
| 3 |
+
import zipfile
|
| 4 |
+
from datetime import UTC, datetime, timedelta
|
| 5 |
+
from pathlib import Path
|
| 6 |
from typing import Any
|
| 7 |
|
| 8 |
import gradio as gr
|
| 9 |
+
import pandas as pd
|
| 10 |
import plotly.express as px
|
| 11 |
+
import plotly.graph_objects as go
|
| 12 |
|
| 13 |
from leaderboard_analytics.schemas import Granularity, QueryFilters
|
| 14 |
from leaderboard_analytics.services import AnalyticsService
|
|
|
|
| 24 |
if isinstance(value, float) and math.isnan(value):
|
| 25 |
return fallback
|
| 26 |
# Gradio DateTime may return Unix timestamps as numbers.
|
| 27 |
+
dt = datetime.fromtimestamp(value, tz=UTC)
|
| 28 |
elif isinstance(value, str):
|
| 29 |
dt = datetime.fromisoformat(value)
|
| 30 |
else:
|
|
|
|
| 32 |
|
| 33 |
# Gradio DateTime may return naive datetime values in local time.
|
| 34 |
if dt.tzinfo is None:
|
| 35 |
+
dt = dt.replace(tzinfo=UTC)
|
| 36 |
+
return dt.astimezone(UTC)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _empty_plot(title: str):
|
| 40 |
+
return px.line(title=title)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _empty_map(title: str):
|
| 44 |
+
figure = go.Figure()
|
| 45 |
+
_style_visitor_location_map(figure, title)
|
| 46 |
+
return figure
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _query_range_text(filters: QueryFilters) -> str:
|
| 50 |
+
return f"{filters.start_time.isoformat()} to {filters.end_time.isoformat()}"
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _write_csv_archive(tables: dict[str, pd.DataFrame]) -> str | None:
|
| 54 |
+
if all(table.empty for table in tables.values()):
|
| 55 |
+
return None
|
| 56 |
+
|
| 57 |
+
archive = tempfile.NamedTemporaryFile(
|
| 58 |
+
prefix="leaderboard-analytics-", suffix=".zip", delete=False
|
| 59 |
+
)
|
| 60 |
+
archive.close()
|
| 61 |
+
with zipfile.ZipFile(archive.name, "w", compression=zipfile.ZIP_DEFLATED) as zip_file:
|
| 62 |
+
for name, table in tables.items():
|
| 63 |
+
zip_file.writestr(f"{name}.csv", table.to_csv(index=False))
|
| 64 |
+
return archive.name
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _visitor_location_top_table(visitor_locations: pd.DataFrame) -> pd.DataFrame:
|
| 68 |
+
if visitor_locations.empty:
|
| 69 |
+
return pd.DataFrame(columns=["Region", "Users"])
|
| 70 |
+
|
| 71 |
+
return (
|
| 72 |
+
visitor_locations.sort_values(["ip_count", "pv"], ascending=[False, False])
|
| 73 |
+
.head(10)
|
| 74 |
+
.rename(columns={"country_name": "Region", "ip_count": "Users"})[["Region", "Users"]]
|
| 75 |
+
.reset_index(drop=True)
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def _visitor_location_debug_text(
|
| 80 |
+
visitor_locations: pd.DataFrame,
|
| 81 |
+
geoip_debug: dict[str, object],
|
| 82 |
+
ip_debug: dict[str, object] | None = None,
|
| 83 |
+
) -> str:
|
| 84 |
+
if visitor_locations.empty:
|
| 85 |
+
total_pv = 0
|
| 86 |
+
total_users = 0
|
| 87 |
+
mapped_regions = 0
|
| 88 |
+
unknown_pv = 0
|
| 89 |
+
unknown_users = 0
|
| 90 |
+
else:
|
| 91 |
+
unknown_rows = visitor_locations[visitor_locations["country_code"] == "Unknown"]
|
| 92 |
+
mapped_rows = visitor_locations[visitor_locations["country_code"] != "Unknown"]
|
| 93 |
+
total_pv = int(visitor_locations["pv"].sum())
|
| 94 |
+
total_users = int(visitor_locations["ip_count"].sum())
|
| 95 |
+
mapped_regions = len(mapped_rows)
|
| 96 |
+
unknown_pv = int(unknown_rows["pv"].sum()) if not unknown_rows.empty else 0
|
| 97 |
+
unknown_users = int(unknown_rows["ip_count"].sum()) if not unknown_rows.empty else 0
|
| 98 |
+
|
| 99 |
+
configured = "yes" if geoip_debug.get("database_configured") else "no"
|
| 100 |
+
exists = "yes" if geoip_debug.get("database_exists") else "no"
|
| 101 |
+
loaded = "yes" if geoip_debug.get("reader_loaded") else "no"
|
| 102 |
+
attempted = "yes" if geoip_debug.get("load_attempted") else "no"
|
| 103 |
+
path = geoip_debug.get("database_path") or "(not configured)"
|
| 104 |
+
ip_debug = ip_debug or {}
|
| 105 |
+
global_ips = int(ip_debug.get("global_ips", 0))
|
| 106 |
+
global_pv = int(ip_debug.get("global_ip_pv", 0))
|
| 107 |
+
private_ips = int(ip_debug.get("private_ips", 0))
|
| 108 |
+
private_pv = int(ip_debug.get("private_ip_pv", 0))
|
| 109 |
+
loopback_ips = int(ip_debug.get("loopback_ips", 0))
|
| 110 |
+
loopback_pv = int(ip_debug.get("loopback_ip_pv", 0))
|
| 111 |
+
invalid_ips = int(ip_debug.get("invalid_ips", 0))
|
| 112 |
+
invalid_pv = int(ip_debug.get("invalid_ip_pv", 0))
|
| 113 |
+
buckets = ip_debug.get("top_ip_pv_buckets", {})
|
| 114 |
+
|
| 115 |
+
return (
|
| 116 |
+
f"GeoIP DB: configured={configured}, exists={exists}, loaded={loaded}, "
|
| 117 |
+
f"load_attempted={attempted} \n"
|
| 118 |
+
f"GeoIP path: `{path}` \n"
|
| 119 |
+
f"Total location PV: {total_pv} | Users/IPs: {total_users} | "
|
| 120 |
+
f"Mapped regions: {mapped_regions} \n"
|
| 121 |
+
f"Unknown PV: {unknown_pv} | Unknown users/IPs: {unknown_users} \n"
|
| 122 |
+
f"Public IPs: {global_ips} ({global_pv} PV) | Private IPs: {private_ips} "
|
| 123 |
+
f"({private_pv} PV) \n"
|
| 124 |
+
f"Loopback IPs: {loopback_ips} ({loopback_pv} PV) | Invalid IPs: {invalid_ips} "
|
| 125 |
+
f"({invalid_pv} PV) \n"
|
| 126 |
+
f"PV/IP buckets: {buckets}"
|
| 127 |
+
)
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def _style_visitor_location_map(figure: go.Figure, title: str) -> None:
|
| 131 |
+
figure.update_geos(
|
| 132 |
+
projection_type="mercator",
|
| 133 |
+
showframe=False,
|
| 134 |
+
showcoastlines=True,
|
| 135 |
+
coastlinecolor="#cfd6df",
|
| 136 |
+
coastlinewidth=0.6,
|
| 137 |
+
showcountries=True,
|
| 138 |
+
countrycolor="#cfd6df",
|
| 139 |
+
countrywidth=0.7,
|
| 140 |
+
showland=True,
|
| 141 |
+
landcolor="#eef2f7",
|
| 142 |
+
showocean=True,
|
| 143 |
+
oceancolor="#f8fafc",
|
| 144 |
+
showlakes=True,
|
| 145 |
+
lakecolor="#f8fafc",
|
| 146 |
+
bgcolor="#ffffff",
|
| 147 |
+
lataxis_range=[-55, 75],
|
| 148 |
+
lonaxis_range=[-180, 180],
|
| 149 |
+
)
|
| 150 |
+
figure.update_layout(
|
| 151 |
+
title={"text": title, "x": 0.02, "xanchor": "left"},
|
| 152 |
+
height=560,
|
| 153 |
+
paper_bgcolor="#ffffff",
|
| 154 |
+
plot_bgcolor="#ffffff",
|
| 155 |
+
font={"color": "#1f2937"},
|
| 156 |
+
margin={"l": 0, "r": 0, "t": 52, "b": 0},
|
| 157 |
+
showlegend=False,
|
| 158 |
+
hoverlabel={
|
| 159 |
+
"bgcolor": "#ffffff",
|
| 160 |
+
"bordercolor": "#3b82f6",
|
| 161 |
+
"font_color": "#111827",
|
| 162 |
+
},
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _visitor_location_map(visitor_locations: pd.DataFrame, range_text: str) -> go.Figure:
|
| 167 |
+
map_df = (
|
| 168 |
+
visitor_locations[visitor_locations["country_code"] != "Unknown"].copy()
|
| 169 |
+
if not visitor_locations.empty
|
| 170 |
+
else visitor_locations.copy()
|
| 171 |
+
)
|
| 172 |
+
if map_df.empty:
|
| 173 |
+
return _empty_map(f"Visitor locations by country (no mapped data for {range_text})")
|
| 174 |
+
|
| 175 |
+
max_pv = max(int(map_df["pv"].max()), 1)
|
| 176 |
+
size_ref = 2.0 * max_pv / (52**2)
|
| 177 |
+
figure = go.Figure(
|
| 178 |
+
go.Scattergeo(
|
| 179 |
+
locationmode="country names",
|
| 180 |
+
locations=map_df["country_name"],
|
| 181 |
+
mode="markers",
|
| 182 |
+
text=map_df["country_name"],
|
| 183 |
+
customdata=map_df[["country_code", "pv", "ip_count"]],
|
| 184 |
+
hovertemplate=(
|
| 185 |
+
"<b>%{text}</b><br>"
|
| 186 |
+
"Country code: %{customdata[0]}<br>"
|
| 187 |
+
"PV: %{customdata[1]:,}<br>"
|
| 188 |
+
"Users/IPs: %{customdata[2]:,}<extra></extra>"
|
| 189 |
+
),
|
| 190 |
+
marker={
|
| 191 |
+
"size": map_df["pv"],
|
| 192 |
+
"sizemode": "area",
|
| 193 |
+
"sizeref": size_ref,
|
| 194 |
+
"sizemin": 8,
|
| 195 |
+
"color": "rgba(59, 130, 246, 0.55)",
|
| 196 |
+
"line": {"color": "rgba(37, 99, 235, 0.92)", "width": 1.2},
|
| 197 |
+
},
|
| 198 |
+
)
|
| 199 |
+
)
|
| 200 |
+
_style_visitor_location_map(figure, "Visitor locations by country")
|
| 201 |
+
figure.add_annotation(
|
| 202 |
+
x=0.02,
|
| 203 |
+
y=0.08,
|
| 204 |
+
xref="paper",
|
| 205 |
+
yref="paper",
|
| 206 |
+
text=(
|
| 207 |
+
f"Mapped regions: {len(map_df)}<br>"
|
| 208 |
+
f"Mapped PV: {int(map_df['pv'].sum()):,}<br>"
|
| 209 |
+
f"Users/IPs: {int(map_df['ip_count'].sum()):,}"
|
| 210 |
+
),
|
| 211 |
+
showarrow=False,
|
| 212 |
+
align="left",
|
| 213 |
+
bgcolor="rgba(255, 255, 255, 0.88)",
|
| 214 |
+
bordercolor="rgba(148, 163, 184, 0.55)",
|
| 215 |
+
borderwidth=1,
|
| 216 |
+
font={"color": "#1f2937", "size": 12},
|
| 217 |
+
)
|
| 218 |
+
return figure
|
| 219 |
|
| 220 |
|
| 221 |
def build_dashboard(service: AnalyticsService) -> gr.Blocks:
|
| 222 |
+
default_end = datetime.now(tz=UTC)
|
| 223 |
default_start = (default_end - timedelta(days=7)).replace(microsecond=0)
|
| 224 |
|
| 225 |
+
def load_benchmarks() -> object:
|
| 226 |
+
try:
|
| 227 |
+
benchmarks = service.get_available_benchmarks()
|
| 228 |
+
except Exception:
|
| 229 |
+
benchmarks = []
|
| 230 |
+
return gr.update(choices=[""] + benchmarks, value="")
|
| 231 |
+
|
| 232 |
def query(
|
| 233 |
start_time: datetime | str | None,
|
| 234 |
end_time: datetime | str | None,
|
| 235 |
benchmark: str,
|
| 236 |
granularity: str,
|
| 237 |
+
) -> tuple[
|
| 238 |
+
object,
|
| 239 |
+
object,
|
| 240 |
+
object,
|
| 241 |
+
object,
|
| 242 |
+
object,
|
| 243 |
+
object,
|
| 244 |
+
object,
|
| 245 |
+
object,
|
| 246 |
+
object,
|
| 247 |
+
object,
|
| 248 |
+
object,
|
| 249 |
+
object,
|
| 250 |
+
object,
|
| 251 |
+
object,
|
| 252 |
+
object,
|
| 253 |
+
object,
|
| 254 |
+
]:
|
| 255 |
+
try:
|
| 256 |
+
filters = QueryFilters(
|
| 257 |
+
start_time=_to_utc_datetime(start_time, default_start),
|
| 258 |
+
end_time=_to_utc_datetime(end_time, default_end),
|
| 259 |
+
benchmark=benchmark or None,
|
| 260 |
+
granularity=Granularity(granularity),
|
| 261 |
+
)
|
| 262 |
+
overview_df, totals = service.get_overview(filters)
|
| 263 |
+
benchmark_df = service.get_benchmark_top(filters)
|
| 264 |
+
filter_df = service.get_filter_distribution(filters)
|
| 265 |
+
funnel_df = service.get_funnel(filters)
|
| 266 |
+
visitors_df = service.get_new_vs_returning(filters)
|
| 267 |
+
visitor_locations_df, ip_debug = service.get_visitor_location_details(filters)
|
| 268 |
+
visitor_locations_top_df = _visitor_location_top_table(visitor_locations_df)
|
| 269 |
+
visitor_locations_debug = _visitor_location_debug_text(
|
| 270 |
+
visitor_locations_df,
|
| 271 |
+
service.get_geoip_debug_info(),
|
| 272 |
+
ip_debug,
|
| 273 |
+
)
|
| 274 |
|
| 275 |
+
range_text = _query_range_text(filters)
|
| 276 |
+
if (
|
| 277 |
+
overview_df.empty
|
| 278 |
+
and benchmark_df.empty
|
| 279 |
+
and filter_df.empty
|
| 280 |
+
and visitors_df.empty
|
| 281 |
+
and visitor_locations_df.empty
|
| 282 |
+
):
|
| 283 |
+
metrics = f"No data for {range_text}."
|
| 284 |
+
else:
|
| 285 |
+
metrics = (
|
| 286 |
+
f"Range: {range_text} \n"
|
| 287 |
+
f"PV: {totals['pv']} | UV: {totals['uv']} | Sessions: {totals['sessions']} | "
|
| 288 |
+
f"Events/Session: {totals['events_per_session']} | "
|
| 289 |
+
f"Sessions/Visitor: {totals['sessions_per_visitor']}"
|
| 290 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
|
| 292 |
+
overview_plot = (
|
| 293 |
+
px.line(
|
| 294 |
+
overview_df,
|
| 295 |
+
x="period",
|
| 296 |
+
y=["pv", "uv", "session_count"],
|
| 297 |
+
title="Traffic overview",
|
| 298 |
+
)
|
| 299 |
+
if not overview_df.empty
|
| 300 |
+
else _empty_plot(f"Traffic overview (no data for {range_text})")
|
| 301 |
+
)
|
| 302 |
+
benchmark_plot = (
|
| 303 |
+
px.bar(benchmark_df, x="benchmark", y="count", title="Benchmark Top")
|
| 304 |
+
if not benchmark_df.empty
|
| 305 |
+
else px.bar(title=f"Benchmark Top (no data for {range_text})")
|
| 306 |
+
)
|
| 307 |
+
filter_plot = (
|
| 308 |
+
px.bar(filter_df, x="event_name", y="count", title="Filter usage")
|
| 309 |
+
if not filter_df.empty
|
| 310 |
+
else px.bar(title=f"Filter usage (no data for {range_text})")
|
| 311 |
+
)
|
| 312 |
+
funnel_plot = px.funnel(funnel_df, x="sessions", y="step", title="Session funnel")
|
| 313 |
+
visitor_plot = (
|
| 314 |
+
px.bar(
|
| 315 |
+
visitors_df,
|
| 316 |
+
x="period",
|
| 317 |
+
y="visitor_count",
|
| 318 |
+
color="visitor_type",
|
| 319 |
+
barmode="group",
|
| 320 |
+
title="New vs returning visitors",
|
| 321 |
+
)
|
| 322 |
+
if not visitors_df.empty
|
| 323 |
+
else px.bar(title=f"New vs returning visitors (no data for {range_text})")
|
| 324 |
+
)
|
| 325 |
+
visitor_locations_plot = _visitor_location_map(visitor_locations_df, range_text)
|
| 326 |
+
csv_archive = _write_csv_archive(
|
| 327 |
+
{
|
| 328 |
+
"overview": overview_df,
|
| 329 |
+
"benchmarks": benchmark_df,
|
| 330 |
+
"filters": filter_df,
|
| 331 |
+
"funnel": funnel_df,
|
| 332 |
+
"visitors": visitors_df,
|
| 333 |
+
"visitor_locations": visitor_locations_df,
|
| 334 |
+
}
|
| 335 |
+
)
|
| 336 |
+
|
| 337 |
+
return (
|
| 338 |
+
metrics,
|
| 339 |
+
overview_plot,
|
| 340 |
+
benchmark_plot,
|
| 341 |
+
filter_plot,
|
| 342 |
+
funnel_plot,
|
| 343 |
+
visitor_plot,
|
| 344 |
+
visitor_locations_plot,
|
| 345 |
+
visitor_locations_debug,
|
| 346 |
+
visitor_locations_top_df,
|
| 347 |
+
overview_df,
|
| 348 |
+
benchmark_df,
|
| 349 |
+
filter_df,
|
| 350 |
+
funnel_df,
|
| 351 |
+
visitors_df,
|
| 352 |
+
visitor_locations_df,
|
| 353 |
+
csv_archive,
|
| 354 |
+
)
|
| 355 |
+
except Exception as exc:
|
| 356 |
+
message = f"Query failed: {exc}"
|
| 357 |
+
empty = pd.DataFrame()
|
| 358 |
+
empty_top = pd.DataFrame(columns=["Region", "Users"])
|
| 359 |
+
return (
|
| 360 |
+
message,
|
| 361 |
+
_empty_plot(message),
|
| 362 |
+
px.bar(title=message),
|
| 363 |
+
px.bar(title=message),
|
| 364 |
+
px.funnel(
|
| 365 |
+
pd.DataFrame({"step": [], "sessions": []}),
|
| 366 |
+
x="sessions",
|
| 367 |
+
y="step",
|
| 368 |
+
title=message,
|
| 369 |
+
),
|
| 370 |
+
px.bar(title=message),
|
| 371 |
+
_empty_map(message),
|
| 372 |
+
message,
|
| 373 |
+
empty_top,
|
| 374 |
+
empty,
|
| 375 |
+
empty,
|
| 376 |
+
empty,
|
| 377 |
+
empty,
|
| 378 |
+
empty,
|
| 379 |
+
empty,
|
| 380 |
+
None,
|
| 381 |
+
)
|
| 382 |
|
| 383 |
with gr.Blocks() as demo:
|
| 384 |
gr.Markdown("# Leaderboard Analytics Dashboard")
|
|
|
|
| 398 |
value=default_end,
|
| 399 |
timezone="UTC",
|
| 400 |
)
|
| 401 |
+
benchmark = gr.Dropdown(
|
| 402 |
+
label="Benchmark",
|
| 403 |
+
choices=[""],
|
| 404 |
+
value="",
|
| 405 |
+
allow_custom_value=True,
|
| 406 |
+
)
|
| 407 |
granularity = gr.Dropdown(
|
| 408 |
label="Granularity",
|
| 409 |
choices=[Granularity.DAY.value, Granularity.WEEK.value, Granularity.MONTH.value],
|
|
|
|
| 411 |
)
|
| 412 |
refresh = gr.Button("Refresh", variant="primary")
|
| 413 |
|
| 414 |
+
metrics_text = gr.Markdown(
|
| 415 |
+
"PV: 0 | UV: 0 | Sessions: 0 | Events/Session: 0 | Sessions/Visitor: 0"
|
| 416 |
+
)
|
| 417 |
|
| 418 |
with gr.Row():
|
| 419 |
overview_plot = gr.Plot(label="Traffic Overview")
|
|
|
|
| 422 |
filter_plot = gr.Plot(label="Filter Behavior")
|
| 423 |
funnel_plot = gr.Plot(label="Funnel")
|
| 424 |
visitor_plot = gr.Plot(label="Visitor Segmentation")
|
| 425 |
+
with gr.Row():
|
| 426 |
+
with gr.Column(scale=2):
|
| 427 |
+
visitor_locations_plot = gr.Plot(label="Visitor Locations")
|
| 428 |
+
with gr.Column(scale=1):
|
| 429 |
+
visitor_locations_debug = gr.Markdown(
|
| 430 |
+
"GeoIP DB: not checked \n"
|
| 431 |
+
"Total location PV: 0 | Users/IPs: 0 | Mapped regions: 0"
|
| 432 |
+
)
|
| 433 |
+
visitor_locations_top_table = gr.DataFrame(
|
| 434 |
+
label="Top 10 Regions",
|
| 435 |
+
interactive=False,
|
| 436 |
+
wrap=True,
|
| 437 |
+
)
|
| 438 |
+
|
| 439 |
+
with gr.Accordion("Raw data", open=False):
|
| 440 |
+
csv_file = gr.File(label="CSV export")
|
| 441 |
+
overview_table = gr.DataFrame(label="Traffic Overview")
|
| 442 |
+
benchmark_table = gr.DataFrame(label="Benchmark Analysis")
|
| 443 |
+
filter_table = gr.DataFrame(label="Filter Behavior")
|
| 444 |
+
funnel_table = gr.DataFrame(label="Funnel")
|
| 445 |
+
visitor_table = gr.DataFrame(label="Visitor Segmentation")
|
| 446 |
+
visitor_locations_table = gr.DataFrame(label="Visitor Locations")
|
| 447 |
+
|
| 448 |
+
outputs = [
|
| 449 |
+
metrics_text,
|
| 450 |
+
overview_plot,
|
| 451 |
+
benchmark_plot,
|
| 452 |
+
filter_plot,
|
| 453 |
+
funnel_plot,
|
| 454 |
+
visitor_plot,
|
| 455 |
+
visitor_locations_plot,
|
| 456 |
+
visitor_locations_debug,
|
| 457 |
+
visitor_locations_top_table,
|
| 458 |
+
overview_table,
|
| 459 |
+
benchmark_table,
|
| 460 |
+
filter_table,
|
| 461 |
+
funnel_table,
|
| 462 |
+
visitor_table,
|
| 463 |
+
visitor_locations_table,
|
| 464 |
+
csv_file,
|
| 465 |
+
]
|
| 466 |
|
| 467 |
refresh.click(
|
| 468 |
fn=query,
|
| 469 |
inputs=[start_time, end_time, benchmark, granularity],
|
| 470 |
+
outputs=outputs,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 471 |
)
|
| 472 |
|
| 473 |
+
demo.load(fn=load_benchmarks, outputs=benchmark)
|
| 474 |
demo.load(
|
| 475 |
fn=query,
|
| 476 |
inputs=[start_time, end_time, benchmark, granularity],
|
| 477 |
+
outputs=outputs,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 478 |
)
|
| 479 |
|
| 480 |
+
Path(tempfile.gettempdir()).mkdir(parents=True, exist_ok=True)
|
| 481 |
return demo
|
|
|
tests/test_geoip_database.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gzip
|
| 2 |
+
|
| 3 |
+
from leaderboard_analytics.geoip_database import ensure_geoip_database
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def test_ensure_geoip_database_downloads_and_decompresses_gzip(tmp_path) -> None:
|
| 7 |
+
source = tmp_path / "GeoLite2-Country.mmdb.gz"
|
| 8 |
+
target = tmp_path / "GeoLite2-Country.mmdb"
|
| 9 |
+
expected_bytes = b"fake-mmdb-bytes"
|
| 10 |
+
|
| 11 |
+
with gzip.open(source, "wb") as gzip_file:
|
| 12 |
+
gzip_file.write(expected_bytes)
|
| 13 |
+
|
| 14 |
+
result = ensure_geoip_database(target, source.as_uri())
|
| 15 |
+
|
| 16 |
+
assert result == target
|
| 17 |
+
assert target.read_bytes() == expected_bytes
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def test_ensure_geoip_database_keeps_existing_file(tmp_path) -> None:
|
| 21 |
+
source = tmp_path / "missing.mmdb.gz"
|
| 22 |
+
target = tmp_path / "GeoLite2-Country.mmdb"
|
| 23 |
+
expected_bytes = b"existing-mmdb-bytes"
|
| 24 |
+
target.write_bytes(expected_bytes)
|
| 25 |
+
|
| 26 |
+
result = ensure_geoip_database(target, source.as_uri())
|
| 27 |
+
|
| 28 |
+
assert result == target
|
| 29 |
+
assert target.read_bytes() == expected_bytes
|
tests/test_repositories.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import UTC, datetime
|
| 2 |
+
|
| 3 |
+
from leaderboard_analytics.repositories import AnalyticsRepository
|
| 4 |
+
from leaderboard_analytics.schemas import QueryFilters
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class CapturingCollection:
|
| 8 |
+
def __init__(self, rows: list[dict] | None = None) -> None:
|
| 9 |
+
self.rows = rows or []
|
| 10 |
+
self.pipeline: list[dict] | None = None
|
| 11 |
+
|
| 12 |
+
def aggregate(self, pipeline: list[dict]):
|
| 13 |
+
self.pipeline = pipeline
|
| 14 |
+
return iter(self.rows)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _filters() -> QueryFilters:
|
| 18 |
+
return QueryFilters(
|
| 19 |
+
start_time=datetime(2026, 1, 1, tzinfo=UTC),
|
| 20 |
+
end_time=datetime(2026, 1, 31, tzinfo=UTC),
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def test_funnel_pipeline_preserves_ordered_step_logic() -> None:
|
| 25 |
+
collection = CapturingCollection()
|
| 26 |
+
repository = AnalyticsRepository(collection) # type: ignore[arg-type]
|
| 27 |
+
|
| 28 |
+
repository.funnel(_filters())
|
| 29 |
+
|
| 30 |
+
assert collection.pipeline is not None
|
| 31 |
+
assert {"$sort": {"session_id": 1, "event_ts": 1}} in collection.pipeline
|
| 32 |
+
assert any(
|
| 33 |
+
"$push" in stage.get("$group", {}).get("events", {}) for stage in collection.pipeline
|
| 34 |
+
)
|
| 35 |
+
assert not any(
|
| 36 |
+
"$addToSet" in str(stage) and "events" in str(stage) for stage in collection.pipeline
|
| 37 |
+
)
|
| 38 |
+
assert any(
|
| 39 |
+
"table_download_at" in str(stage) and "$filter_change_at" in str(stage)
|
| 40 |
+
for stage in collection.pipeline
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def test_new_vs_returning_pipeline_computes_first_seen_before_range_match() -> None:
|
| 45 |
+
collection = CapturingCollection()
|
| 46 |
+
repository = AnalyticsRepository(collection) # type: ignore[arg-type]
|
| 47 |
+
|
| 48 |
+
repository.visitors_new_vs_returning(_filters())
|
| 49 |
+
|
| 50 |
+
assert collection.pipeline is not None
|
| 51 |
+
window_index = next(
|
| 52 |
+
i for i, stage in enumerate(collection.pipeline) if "$setWindowFields" in stage
|
| 53 |
+
)
|
| 54 |
+
range_match_index = next(
|
| 55 |
+
i
|
| 56 |
+
for i, stage in enumerate(collection.pipeline)
|
| 57 |
+
if stage.get("$match", {}).get("event_ts") is not None
|
| 58 |
+
)
|
| 59 |
+
assert window_index < range_match_index
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def test_overview_totals_filters_empty_identifiers() -> None:
|
| 63 |
+
collection = CapturingCollection([{"pv": 1, "uv": 1, "sessions": 1, "events": 2}])
|
| 64 |
+
repository = AnalyticsRepository(collection) # type: ignore[arg-type]
|
| 65 |
+
|
| 66 |
+
totals = repository.overview_totals(_filters())
|
| 67 |
+
|
| 68 |
+
assert totals == {"pv": 1, "uv": 1, "sessions": 1, "events": 2}
|
| 69 |
+
assert collection.pipeline is not None
|
| 70 |
+
pipeline_text = str(collection.pipeline)
|
| 71 |
+
assert '"$sessions"' in pipeline_text or "'$sessions'" in pipeline_text
|
| 72 |
+
assert '"$visitors"' in pipeline_text or "'$visitors'" in pipeline_text
|
| 73 |
+
assert "$$s" in pipeline_text
|
| 74 |
+
assert "$$v" in pipeline_text
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def test_visitor_ip_counts_groups_page_view_ips_with_existing_filters() -> None:
|
| 78 |
+
collection = CapturingCollection([{"ip": "8.8.8.8", "pv": 3}])
|
| 79 |
+
repository = AnalyticsRepository(collection) # type: ignore[arg-type]
|
| 80 |
+
filters = QueryFilters(
|
| 81 |
+
start_time=datetime(2026, 1, 1, tzinfo=UTC),
|
| 82 |
+
end_time=datetime(2026, 1, 31, tzinfo=UTC),
|
| 83 |
+
benchmark="MTEB",
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
rows = repository.visitor_ip_counts(filters)
|
| 87 |
+
|
| 88 |
+
assert rows == [{"ip": "8.8.8.8", "pv": 3}]
|
| 89 |
+
assert collection.pipeline is not None
|
| 90 |
+
pipeline_text = str(collection.pipeline)
|
| 91 |
+
assert "properties.ip" in pipeline_text
|
| 92 |
+
assert "page_view" in pipeline_text
|
| 93 |
+
assert "MTEB" in pipeline_text
|
| 94 |
+
assert "$nin" in pipeline_text
|
| 95 |
+
assert "$properties.ip" in pipeline_text
|
tests/test_schemas.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import UTC, datetime
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
from pydantic import ValidationError
|
| 5 |
+
|
| 6 |
+
from leaderboard_analytics.schemas import QueryFilters
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def test_query_filters_rejects_invalid_time_range() -> None:
|
| 10 |
+
with pytest.raises(
|
| 11 |
+
ValidationError, match="start_time must be earlier than or equal to end_time"
|
| 12 |
+
):
|
| 13 |
+
QueryFilters(
|
| 14 |
+
start_time=datetime(2026, 1, 2, tzinfo=UTC),
|
| 15 |
+
end_time=datetime(2026, 1, 1, tzinfo=UTC),
|
| 16 |
+
)
|
tests/test_services.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import UTC, datetime
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
from leaderboard_analytics.schemas import QueryFilters
|
| 5 |
+
from leaderboard_analytics.services import AnalyticsService
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class FakeRepository:
|
| 9 |
+
def overview_timeseries(self, filters: QueryFilters) -> list[dict]:
|
| 10 |
+
return [
|
| 11 |
+
{"period": "2026-01-01", "pv": 2, "uv": 1, "session_count": 1, "event_count": 3},
|
| 12 |
+
{"period": "2026-01-02", "pv": 1, "uv": 1, "session_count": 1, "event_count": 2},
|
| 13 |
+
]
|
| 14 |
+
|
| 15 |
+
def overview_totals(self, filters: QueryFilters) -> dict:
|
| 16 |
+
return {"pv": 3, "uv": 1, "sessions": 1, "events": 5}
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class LocationRepository:
|
| 20 |
+
def __init__(self, rows: list[dict]) -> None:
|
| 21 |
+
self.rows = rows
|
| 22 |
+
|
| 23 |
+
def visitor_ip_counts(self, filters: QueryFilters) -> list[dict]:
|
| 24 |
+
return self.rows
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class FakeGeoIpResolver:
|
| 28 |
+
def __init__(self, countries: dict[str, tuple[str, str]]) -> None:
|
| 29 |
+
self.countries = countries
|
| 30 |
+
|
| 31 |
+
def resolve_country(self, ip_address: str) -> tuple[str, str]:
|
| 32 |
+
return self.countries[ip_address]
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def test_overview_uses_full_range_distinct_totals() -> None:
|
| 36 |
+
service = AnalyticsService(FakeRepository()) # type: ignore[arg-type]
|
| 37 |
+
filters = QueryFilters(
|
| 38 |
+
start_time=datetime(2026, 1, 1, tzinfo=UTC),
|
| 39 |
+
end_time=datetime(2026, 1, 2, tzinfo=UTC),
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
frame, totals = service.get_overview(filters)
|
| 43 |
+
|
| 44 |
+
assert list(frame["period"]) == ["2026-01-01", "2026-01-02"]
|
| 45 |
+
assert totals == {
|
| 46 |
+
"pv": 3,
|
| 47 |
+
"uv": 1,
|
| 48 |
+
"sessions": 1,
|
| 49 |
+
"events": 5,
|
| 50 |
+
"events_per_session": 5.0,
|
| 51 |
+
"sessions_per_visitor": 1.0,
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def test_visitor_locations_groups_pv_and_ip_count_by_country() -> None:
|
| 56 |
+
repository = LocationRepository(
|
| 57 |
+
[
|
| 58 |
+
{"ip": "8.8.8.8", "pv": 3},
|
| 59 |
+
{"ip": "8.8.4.4", "pv": 2},
|
| 60 |
+
{"ip": "1.1.1.1", "pv": 4},
|
| 61 |
+
]
|
| 62 |
+
)
|
| 63 |
+
resolver = FakeGeoIpResolver(
|
| 64 |
+
{
|
| 65 |
+
"8.8.8.8": ("US", "United States"),
|
| 66 |
+
"8.8.4.4": ("US", "United States"),
|
| 67 |
+
"1.1.1.1": ("AU", "Australia"),
|
| 68 |
+
}
|
| 69 |
+
)
|
| 70 |
+
service = AnalyticsService(
|
| 71 |
+
repository, # type: ignore[arg-type]
|
| 72 |
+
geoip_resolver=resolver, # type: ignore[arg-type]
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
frame = service.get_visitor_locations(
|
| 76 |
+
QueryFilters(
|
| 77 |
+
start_time=datetime(2026, 1, 1, tzinfo=UTC),
|
| 78 |
+
end_time=datetime(2026, 1, 2, tzinfo=UTC),
|
| 79 |
+
)
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
+
assert frame.to_dict("records") == [
|
| 83 |
+
{"country_code": "US", "country_name": "United States", "pv": 5, "ip_count": 2},
|
| 84 |
+
{"country_code": "AU", "country_name": "Australia", "pv": 4, "ip_count": 1},
|
| 85 |
+
]
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def test_visitor_locations_groups_unresolved_ips_as_unknown() -> None:
|
| 89 |
+
repository = LocationRepository(
|
| 90 |
+
[
|
| 91 |
+
{"ip": "10.0.0.1", "pv": 2},
|
| 92 |
+
{"ip": "not-an-ip", "pv": 1},
|
| 93 |
+
{"ip": "8.8.8.8", "pv": 3},
|
| 94 |
+
]
|
| 95 |
+
)
|
| 96 |
+
service = AnalyticsService(
|
| 97 |
+
repository, # type: ignore[arg-type]
|
| 98 |
+
geoip_database_path=Path("missing-geolite2-country.mmdb"),
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
frame = service.get_visitor_locations(
|
| 102 |
+
QueryFilters(
|
| 103 |
+
start_time=datetime(2026, 1, 1, tzinfo=UTC),
|
| 104 |
+
end_time=datetime(2026, 1, 2, tzinfo=UTC),
|
| 105 |
+
)
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
assert frame.to_dict("records") == [
|
| 109 |
+
{"country_code": "Unknown", "country_name": "Unknown", "pv": 6, "ip_count": 3}
|
| 110 |
+
]
|