diff --git a/.gitignore b/.gitignore index c089b4b..397f37b 100644 --- a/.gitignore +++ b/.gitignore @@ -173,4 +173,11 @@ poetry.toml # ruff .ruff_cache/ -# End of https://www.toptal.com/developers/gitignore/api/python \ No newline at end of file +# End of https://www.toptal.com/developers/gitignore/api/python + +# Project specific +results/ + +# OSMnx cache (temporary files) +cache/ +analysis/cache/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..1269957 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,97 @@ +# Changelog - Bikenv Prediction Platform + +## 2025-12-27 - Initial Implementation + +### Added +- **Data Retrieval Script** (`scripts/retrieve_data.py`) + - Manual data entry from Copenhagenize Index 2025 edition + - Function to fetch and save top 30 cities with scores + - Notes for future automated scraping implementation + +- **Index Calculation Functions** (`scripts/calculate_indices.py`) + - `calculate_altitude_index()`: Measures city hilliness using OSM elevation data + - `calculate_distance_index()`: Measures network connectivity/compactness + - Both functions integrated with OSMnx for real geographic data + +- **Analysis Platform** (`analysis/prediction_platform.py`) + - Comprehensive hypothesis testing framework + - Statistical analysis (Pearson, Spearman correlations) + - Linear regression modeling + - Automated visualization generation + - CSV export of results + +- **Demo Mode** (`analysis/demo_platform.py`) + - Simplified version with synthetic data + - No API dependencies required + - Quick testing and validation + +- **Project Structure** + - `data/` - Reference datasets + - `scripts/` - Data retrieval and calculation utilities + - `analysis/` - Main platform and demo scripts + - `results/` - Output directory for plots and CSVs + +- **Documentation** + - Comprehensive README with methodology and usage + - Structure verification script + - Requirements file for dependencies + +### Changed +- **Updated to Copenhagenize Index 2025 Edition** + - Previous: Referenced "Global Bicycle Cities Index 2022" + - Current: **Copenhagenize Index 2025 (EIT Urban Mobility Edition)** + - Reason: 2025 is the latest available edition + - Source: https://copenhagenizeindex.eu/ + +- **Data Attribution Improvements** + - Added full source citation: "The Global Ranking of Bicycle-Friendly Cities" + - Included publisher: Copenhagenize Design Company & EIT Urban Mobility + - Added direct link to official website + - Clarified data retrieval date and method + +### Dataset Details + +**Copenhagenize Index 2025 Edition** +- Top 30 cities included (from 100 total ranked) +- Score range: 50.3 (Vancouver) to 71.1 (Utrecht) +- Countries represented: 15 +- Top countries: France (5), Netherlands (4), Germany (3), Canada (3) + +### Hypotheses Tested + +1. **H1**: Lower altitude index (A_i) correlates with higher bicycle scores + - Expected: Flat cities are more bike-friendly + +2. **H2**: Distance index (D_i) closer to 1 correlates with higher bicycle scores + - Expected: Better-connected networks are more bike-friendly + +### Technical Stack + +- Python 3.12+ +- pandas, numpy, matplotlib, seaborn +- scipy (statistical analysis) +- scikit-learn (regression) +- osmnx, networkx (geographic analysis) +- geopandas (spatial data) + +### Known Limitations + +1. Sample size limited to 15 cities for computational efficiency +2. Requires OpenStreetMap API access for real data +3. Elevation data may require Google Elevation API key +4. 
Analysis time: 10-30 minutes per run with real data + +### Future Enhancements + +- [ ] Automated web scraping for data updates +- [ ] Expand to all 100 cities in index +- [ ] Add weather/climate indices +- [ ] Integrate bike infrastructure metrics +- [ ] Develop combined predictive model +- [ ] Real-time data validation + +--- + +**Contributors**: Brandon Trigueros Lara +**Project**: TCU - SIMOVI Lab, Universidad de Costa Rica +**Issue**: bikenv#2 diff --git a/README.md b/README.md index 9335b07..70b0f31 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,15 @@ # bikenv: Environmental factors that affect cycling -Topographical and climatic indexes to quantify their effect on cycling. \ No newline at end of file +Topographical and climatic indexes to quantify their effect on cycling. + +## Project Structure + +This is a research analysis project, not a Python package. The structure is: + +- `scripts/` - Core calculation functions (altitude_index, distance_index) +- `analysis/` - Statistical analysis and hypothesis testing platform +- `data/` - Copenhagenize Index 2025 Edition reference data +- `results/` - Generated analysis outputs (CSV, plots) +- `requirements-platform.txt` - Python dependencies + +**Note:** This project was previously structured as an installable package with `setup.py` and a `bikenv/` module, but has been refactored into a scripts-based analysis platform. All dependencies are managed via `requirements-platform.txt`. \ No newline at end of file diff --git a/analysis/README.md b/analysis/README.md new file mode 100644 index 0000000..7644bdc --- /dev/null +++ b/analysis/README.md @@ -0,0 +1,227 @@ +# Bikenv Prediction Platform + +**Issue #2**: Platform to test prediction capabilities of altitude and distance indices + +## Overview + +This platform evaluates the prediction capabilities of the proposed `altitude_index (A_i)` and `distance_index (D_i)` using data from the **Copenhagenize Index 2025 Edition** as a reference. + +**Data Source**: [The Global Ranking of Bicycle-Friendly Cities](https://copenhagenizeindex.eu/) (Copenhagenize Index - EIT Urban Mobility Edition 2025) + +## Hypotheses + +This platform tests two hypotheses: + +1. **Hypothesis 1**: The lower the `A_i` (altitude index), the better for cycling +2. **Hypothesis 2**: The closer to 1 the `D_i` (distance index), the better for cycling + +## Methodology + +### Altitude Index (A_i) + +The altitude index quantifies the hilliness of a city by measuring elevation changes across the road network: + +``` +A_i = (mean_elevation_change / mean_edge_length) × 100 +``` + +Where: +- `mean_elevation_change`: Average elevation difference across road segments (meters) +- `mean_edge_length`: Average length of road segments (meters) + +**Interpretation**: Lower values indicate flatter terrain, which is expected to correlate with better cycling conditions. + +### Distance Index (D_i) + +The distance index measures the connectivity and compactness of a city's cycling network: + +``` +D_i = circuity / (1 + normalized_node_density) +``` + +Where: +- `circuity`: Ratio of network distances to straight-line distances (1.0 = perfectly direct routes) +- `normalized_node_density`: Number of intersections per km², normalized to [0, 1] + +**Interpretation**: Values closer to 1 indicate better connectivity with more direct routes. 
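+
+As a quick sanity check of both formulas, here is a tiny illustrative calculation (the numbers are assumed for illustration, not measured values for any city):
+
+```python
+# Illustrative numbers only -- not measurements from a real city.
+mean_elevation_change = 1.2    # average climb per road segment, in metres (assumed)
+mean_edge_length = 90.0        # average road-segment length, in metres (assumed)
+altitude_index = (mean_elevation_change / mean_edge_length) * 100
+print(f"A_i = {altitude_index:.2f}")   # -> A_i = 1.33 (fairly flat)
+
+circuity = 1.05                # network routes 5% longer than straight lines (assumed)
+normalized_node_density = 0.4  # node density scaled into [0, 1] (assumed)
+distance_index = circuity / (1 + normalized_node_density)
+print(f"D_i = {distance_index:.2f}")   # -> D_i = 0.75 (below the optimum of 1)
+```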
+ +### Data Source + +The **Copenhagenize Index 2025 Edition** (official name: "The Global Ranking of Bicycle-Friendly Cities") ranks the top 100 bicycle-friendly cities globally based on 13 indicators across 3 pillars: Infrastructure, Usage, and Policy. + +This platform uses the **top 30 cities** as reference data, with scores ranging from 50.3 (Vancouver) to 71.1 (Utrecht). + +**Source**: https://copenhagenizeindex.eu/ +**Publisher**: Copenhagenize Design Company & EIT Urban Mobility +**Data retrieved**: December 2025 + +## Project Structure + +``` +bikenv/ +├── data/ +│ ├── copenhagenize_index_2025.csv # Reference data (2025 edition) +│ └── copenhagenize_index_2022.csv # Legacy data (deprecated) +├── scripts/ +│ ├── retrieve_data.py # Script to fetch latest index data +│ └── calculate_indices.py # Functions to calculate A_i and D_i +├── analysis/ +│ └── prediction_platform.py # Main analysis script +├── results/ # Output directory +│ ├── cities_with_indices.csv # Cities with calculated indices +│ ├── statistical_results.csv # Correlation and regression results +│ └── hypothesis_testing_results.png # Visualization plots +└── requirements-platform.txt # Python dependencies +``` + +## Installation + +1. **Clone the repository** (if not already done): + ```bash + git clone https://github.com/simovilab/bikenv.git + cd bikenv + ``` + +2. **Install dependencies**: + ```bash + pip install -r requirements-platform.txt + ``` + + Or using conda: + ```bash + conda install pandas numpy matplotlib seaborn scipy scikit-learn + conda install -c conda-forge osmnx + ``` + +## Usage + +### Running the Complete Analysis + +From the `analysis/` directory: + +```bash +cd analysis +python prediction_platform.py +``` + +This will: +1. Load the Copenhagenize Index 2025 data +2. Sample 15 cities across different performance tiers +3. Calculate `A_i` and `D_i` for each city using OpenStreetMap data +4. Perform statistical analysis (correlation, regression) +5. Generate visualizations +6. Save results to the `results/` directory + +**Note**: The analysis may take 10-30 minutes depending on network speed and API rate limits, as it downloads geographic data for each city. + +### Calculating Indices for Individual Cities + +```python +from scripts.calculate_indices import calculate_indices_for_city + +# Calculate indices for a single city +altitude_idx, distance_idx = calculate_indices_for_city("Amsterdam", "Netherlands") + +print(f"Altitude Index: {altitude_idx:.3f}") +print(f"Distance Index: {distance_idx:.3f}") +``` + +## Statistical Methods + +The platform employs multiple statistical approaches: + +1. **Pearson Correlation**: Measures linear relationship strength +2. **Spearman Correlation**: Measures monotonic relationship (rank-based) +3. **Linear Regression**: Models the relationship and calculates R² score +4. **Significance Testing**: p-values < 0.05 indicate statistical significance + +### Interpretation Criteria + +- **Strong support**: |r| > 0.5 and p < 0.05 +- **Moderate support**: 0.3 < |r| < 0.5 and p < 0.05 +- **Weak support**: |r| < 0.3 and p < 0.05 +- **Not significant**: p ≥ 0.05 + +## Output Files + +After running the analysis, the following files are generated in `results/`: + +1. **cities_with_indices.csv**: Complete dataset with calculated indices +2. **statistical_results.csv**: Summary of correlation and regression analysis +3. 
**hypothesis_testing_results.png**: 4-panel visualization showing:
+   - Altitude Index vs Score scatter plot
+   - Distance Index vs Score scatter plot
+   - Distance from Optimal D_i vs Score
+   - Correlation heatmap
+
+## Expected Results
+
+Based on urban cycling research, we expect:
+
+- **Negative correlation** between `A_i` and cycling scores (flatter cities rank higher)
+- **Cities with D_i ≈ 1** to have higher scores (better network connectivity)
+
+Top-performing cities like Utrecht, Copenhagen, and Amsterdam are expected to have:
+- Low `A_i` values (< 2.0, indicating flat terrain)
+- `D_i` values close to 1.0 (indicating efficient, direct networks)
+
+## Limitations
+
+1. **Sample Size**: Analysis uses 15 cities for computational efficiency
+2. **API Dependencies**: Requires OpenStreetMap data access
+3. **Elevation Data**: May require Google Elevation API key for accurate altitude calculations
+4. **Network Complexity**: Simplified metrics may not capture all aspects of cyclability
+
+## Future Improvements
+
+- [ ] Expand sample size to all 30 cities
+- [ ] Add weather/climate index
+- [ ] Incorporate bike infrastructure data (protected lanes, bike parking)
+- [ ] Test against modal share data (% of trips by bicycle)
+- [ ] Develop combined predictive model
+
+## References
+
+- **Copenhagenize Index**: https://copenhagenizeindex.eu/
+- **OSMnx Documentation**: https://osmnx.readthedocs.io/
+- **GTFS and Urban Mobility**: https://gtfs.org/
+
+## License
+
+MIT License - See repository LICENSE file
+
+## Author
+
+**Brandon Trigueros Lara**
+TCU Project - SIMOVI Lab, Universidad de Costa Rica
+December 2025
+
+---
+
+## Quick Start Example
+
+Run this from the repository root so the `scripts` package is importable:
+
+```python
+# Quick test with a sample city
+import pandas as pd
+from scripts.calculate_indices import calculate_indices_for_city
+
+# Test with Amsterdam
+print("Calculating indices for Amsterdam...")
+a_i, d_i = calculate_indices_for_city("Amsterdam", "Netherlands")
+
+print("\nAmsterdam Results:")
+print(f"  Altitude Index: {a_i:.3f} (lower is better)")
+print(f"  Distance Index: {d_i:.3f} (closer to 1 is better)")
+
+# Load reference data to compare
+df = pd.read_csv('data/copenhagenize_index_2025.csv')
+amsterdam = df[df['city'] == 'Amsterdam'].iloc[0]
+
+print(f"  Copenhagenize Score: {amsterdam['score']} (rank #{amsterdam['rank']})")
+```
+
+## Contact
+
+For questions or issues, please open an issue on GitHub or contact:
+- brandon.trigueros@ucr.ac.cr
+- Laboratory: SIMOVI - UCR
diff --git a/analysis/demo_platform.py b/analysis/demo_platform.py
new file mode 100644
index 0000000..5031d2f
--- /dev/null
+++ b/analysis/demo_platform.py
@@ -0,0 +1,152 @@
+"""
+Simplified Demo - Testing Platform Proof of Concept
+
+This script demonstrates the platform with synthetic/mock data for cities,
+allowing testing without API dependencies.
+"""
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from scipy.stats import pearsonr
+import os
+
+
+def generate_mock_indices(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Generate mock altitude and distance indices based on expected patterns.
+
+    This is for demonstration purposes only.
+ Top-performing cities (high scores) should have: + - Lower altitude indices (flatter terrain) + - Distance indices closer to 1 (better connectivity) + """ + np.random.seed(42) + + # Generate altitude indices with negative correlation to score + # Top cities (high score) get low A_i, bottom cities get high A_i + df['altitude_index'] = 5.0 - (df['score'] / 100) * 4.0 + np.random.normal(0, 0.5, len(df)) + df['altitude_index'] = df['altitude_index'].clip(lower=0.5) + + # Generate distance indices with values closer to 1 for high-scoring cities + # Top cities get D_i close to 1, others deviate more + base_distance = 1.0 + df['distance_index'] = base_distance + ((100 - df['score']) / 200) + np.random.normal(0, 0.1, len(df)) + df['distance_index'] = df['distance_index'].clip(lower=0.5, upper=2.0) + + return df + + +def demo_hypothesis_testing(): + """Run a simplified demonstration of the hypothesis testing.""" + + print("="*70) + print("BIKENV PREDICTION PLATFORM - DEMO MODE") + print("Testing with Mock Data") + print("Data: Copenhagenize Index 2025 Edition") + print("="*70) + + # Load the Copenhagenize data + df = pd.read_csv('../data/copenhagenize_index_2025.csv') + print(f"\n✓ Loaded {len(df)} cities from Copenhagenize Index 2025") + + # Sample 15 cities across the spectrum + sampled = pd.concat([ + df.head(5), # Top performers + df.iloc[12:17], # Middle + df.iloc[25:30] # Lower + ]).reset_index(drop=True) + + # Generate mock indices + sampled = generate_mock_indices(sampled) + + print(f"\n✓ Generated mock indices for {len(sampled)} cities\n") + print(sampled[['city', 'score', 'altitude_index', 'distance_index']].to_string(index=False)) + + # Test Hypothesis 1: Lower A_i = Better for cycling + print("\n" + "="*70) + print("HYPOTHESIS 1: Lower A_i = Better for Cycling") + print("="*70) + + corr_altitude, p_altitude = pearsonr(sampled['altitude_index'], sampled['score']) + print(f"\nPearson correlation: {corr_altitude:.3f} (p-value: {p_altitude:.4f})") + + if corr_altitude < -0.3 and p_altitude < 0.05: + print("✓ HYPOTHESIS SUPPORTED: Significant negative correlation") + else: + print("~ Result: Check with real data") + + # Test Hypothesis 2: D_i closer to 1 = Better for cycling + print("\n" + "="*70) + print("HYPOTHESIS 2: D_i Closer to 1 = Better for Cycling") + print("="*70) + + sampled['distance_from_optimal'] = abs(1 - sampled['distance_index']) + corr_distance, p_distance = pearsonr(sampled['distance_from_optimal'], sampled['score']) + print(f"\nPearson correlation: {corr_distance:.3f} (p-value: {p_distance:.4f})") + + if corr_distance < -0.3 and p_distance < 0.05: + print("✓ HYPOTHESIS SUPPORTED: Significant negative correlation") + else: + print("~ Result: Check with real data") + + # Create visualization + print("\n" + "="*70) + print("Creating Visualizations") + print("="*70) + + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5)) + + # Plot 1: Altitude Index + sns.scatterplot(data=sampled, x='altitude_index', y='score', s=100, alpha=0.7, ax=ax1) + z = np.polyfit(sampled['altitude_index'], sampled['score'], 1) + p = np.poly1d(z) + x_line = np.linspace(sampled['altitude_index'].min(), sampled['altitude_index'].max(), 100) + ax1.plot(x_line, p(x_line), "r--", alpha=0.8, linewidth=2) + ax1.set_xlabel('Altitude Index (A_i)', fontsize=12, fontweight='bold') + ax1.set_ylabel('Bicycle Cities Index Score', fontsize=12, fontweight='bold') + ax1.set_title('Hypothesis 1: Lower A_i = Better Cycling', fontsize=13, fontweight='bold') + ax1.grid(True, alpha=0.3) + + # Plot 2: Distance Index + 
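# The vertical line at x = 1 added below marks the hypothesized optimum:
+    # under Hypothesis 2, cities with D_i nearer 1 should tend to score higher.
+    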
sns.scatterplot(data=sampled, x='distance_index', y='score', s=100, alpha=0.7, ax=ax2) + ax2.axvline(x=1, color='green', linestyle=':', linewidth=2, alpha=0.5, label='Optimal (D_i=1)') + z = np.polyfit(sampled['distance_index'], sampled['score'], 1) + p = np.poly1d(z) + x_line = np.linspace(sampled['distance_index'].min(), sampled['distance_index'].max(), 100) + ax2.plot(x_line, p(x_line), "r--", alpha=0.8, linewidth=2) + ax2.set_xlabel('Distance Index (D_i)', fontsize=12, fontweight='bold') + ax2.set_ylabel('Bicycle Cities Index Score', fontsize=12, fontweight='bold') + ax2.set_title('Hypothesis 2: D_i Closer to 1 = Better Cycling', fontsize=13, fontweight='bold') + ax2.legend() + ax2.grid(True, alpha=0.3) + + plt.tight_layout() + + # Save plot + os.makedirs('../results', exist_ok=True) + output_path = '../results/demo_results.png' + plt.savefig(output_path, dpi=300, bbox_inches='tight') + print(f"\n✓ Saved visualization to: {output_path}") + + # Save data + output_csv = '../results/demo_cities_with_indices.csv' + sampled.to_csv(output_csv, index=False) + print(f"✓ Saved data to: {output_csv}") + + plt.show() + + print("\n" + "="*70) + print("DEMO COMPLETE") + print("="*70) + print("\nNOTE: This demo uses mock data for demonstration.") + print("Run prediction_platform.py for analysis with real geographic data.") + print("\nNext steps:") + print(" 1. Install OSMnx: pip install osmnx") + print(" 2. Run: python prediction_platform.py") + print(" 3. Wait for real data calculation (10-30 minutes)") + + +if __name__ == "__main__": + demo_hypothesis_testing() diff --git a/analysis/prediction_platform.py b/analysis/prediction_platform.py new file mode 100644 index 0000000..dfae513 --- /dev/null +++ b/analysis/prediction_platform.py @@ -0,0 +1,481 @@ +""" +Prediction Platform for Testing Altitude and Distance Index Hypotheses + +This script tests the following hypotheses using the Copenhagenize Index 2025: +1. The lower the A_i (altitude index), the better for cycling +2. 
The closer to 1 the D_i (distance index), the better for cycling
+
+Data Source: The Global Ranking of Bicycle-Friendly Cities (Copenhagenize Index)
+https://copenhagenizeindex.eu/
+Edition: 2025 (EIT Urban Mobility Edition)
+"""
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from scipy.stats import pearsonr, spearmanr
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import r2_score
+import os
+import sys
+import signal
+import time
+from contextlib import contextmanager
+
+# Add the repository root to the path so `scripts` can be imported
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from scripts.calculate_indices import calculate_indices_for_city
+
+
+class TimeoutException(Exception):
+    """Exception raised when an operation times out"""
+    pass
+
+
+@contextmanager
+def time_limit(seconds):
+    """Context manager to limit execution time (uses SIGALRM, so POSIX-only)"""
+    def signal_handler(signum, frame):
+        raise TimeoutException(f"Timed out after {seconds} seconds")
+
+    # Set the signal handler and alarm
+    signal.signal(signal.SIGALRM, signal_handler)
+    signal.alarm(seconds)
+    try:
+        yield
+    finally:
+        signal.alarm(0)  # Disable the alarm
+
+
+def load_bicycle_index_data(filepath: str) -> pd.DataFrame:
+    """Load the Copenhagenize Bicycle Cities Index data."""
+    # Resolve the (possibly relative) filepath against the script's location
+    filepath = os.path.join(os.path.dirname(__file__), filepath)
+    df = pd.read_csv(filepath)
+    print(f"Loaded {len(df)} cities from the index")
+    return df
+
+
+def calculate_indices_for_cities(df: pd.DataFrame, sample_size: int = 15) -> pd.DataFrame:
+    """
+    Calculate A_i and D_i for a sample of cities from the index.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame with city information
+    sample_size : int
+        Number of cities to sample (currently fixed at 15 by the tiered
+        sampling below: 5 top, 5 middle, and 5 lower performers)
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with added altitude_index and distance_index columns
+    """
+    # Sample cities from different score ranges to get a good distribution:
+    # top performers, middle performers, and lower performers
+    top_cities = df.head(5)
+    middle_cities = df.iloc[12:17]  # Around rank 13-17
+    lower_cities = df.iloc[25:30]  # Around rank 26-30
+
+    sampled_df = pd.concat([top_cities, middle_cities, lower_cities]).reset_index(drop=True)
+
+    total_cities = len(sampled_df)
+    print(f"\nCalculating indices for {total_cities} cities...", flush=True)
+    print("This may take several minutes...", flush=True)
+    print("(Cities with very large areas will be skipped automatically)", flush=True)
+    print(f"\n{'='*70}", flush=True)
+
+    altitude_indices = []
+    distance_indices = []
+    successful_count = 0
+    failed_cities = []
+
+    for idx, row in sampled_df.iterrows():
+        city = row['city']
+        country = row['country']
+        city_num = idx + 1
+
+        print(f"\n[{city_num}/{total_cities}] Processing: {city}, {country}", flush=True)
+        start_time = time.time()
+
+        try:
+            # Set timeout to 5 minutes per city (300 seconds)
+            # Cities like Québec that take too long will be skipped
+            print(f"  → Downloading network data...", flush=True)
+            with time_limit(300):
+                a_i, d_i = calculate_indices_for_city(city, country)
+            elapsed = time.time() - start_time
+
+            if a_i is not None and d_i is not None:
+                altitude_indices.append(a_i)
+                distance_indices.append(d_i)
+                a_i_str = f"{a_i:.3f}"
+                d_i_str = f"{d_i:.3f}"
+                print(f"  ✓ SUCCESS: A_i={a_i_str}, D_i={d_i_str} (took {elapsed:.1f}s)", flush=True)
+                
successful_count += 1 + else: + elapsed = time.time() - start_time + print(f" ✗ FAILED: Calculation returned None (took {elapsed:.1f}s)", flush=True) + altitude_indices.append(None) + distance_indices.append(None) + failed_cities.append(f"{city} (returned None)") + + except TimeoutException as e: + elapsed = time.time() - start_time + print(f" ✗ SKIPPED: Area too large, would take >5 minutes (stopped at {elapsed:.1f}s)", flush=True) + altitude_indices.append(None) + distance_indices.append(None) + failed_cities.append(f"{city} (timeout)") + + except KeyboardInterrupt: + elapsed = time.time() - start_time + print(f"\n ⚠ INTERRUPTED by user at {elapsed:.1f}s", flush=True) + print(f"\nStopping analysis. Processed {successful_count}/{city_num} cities so far.", flush=True) + # Add None for remaining cities + remaining = len(sampled_df) - len(altitude_indices) + altitude_indices.extend([None] * remaining) + distance_indices.extend([None] * remaining) + break + + except Exception as e: + elapsed = time.time() - start_time + error_msg = str(e) + if "900 times your configured" in error_msg: + print(f" ✗ SKIPPED: Area too large for Overpass API (at {elapsed:.1f}s)", flush=True) + failed_cities.append(f"{city} (area too large)") + else: + print(f" ✗ ERROR: {error_msg[:100]} (at {elapsed:.1f}s)", flush=True) + failed_cities.append(f"{city} ({type(e).__name__})") + altitude_indices.append(None) + distance_indices.append(None) + + sampled_df['altitude_index'] = altitude_indices + sampled_df['distance_index'] = distance_indices + + # Remove cities where calculation failed + original_count = len(sampled_df) + sampled_df = sampled_df.dropna(subset=['altitude_index', 'distance_index']) + + print(f"\n{'='*70}", flush=True) + print(f"SUMMARY: Successfully calculated indices for {len(sampled_df)}/{original_count} cities", flush=True) + + if failed_cities: + print(f"\nSkipped cities ({len(failed_cities)}):", flush=True) + for city in failed_cities: + print(f" - {city}", flush=True) + print(f"{'='*70}\n", flush=True) + + return sampled_df + + +def test_altitude_hypothesis(df: pd.DataFrame) -> dict: + """ + Test Hypothesis 1: Lower A_i = better for cycling + + Expected: Negative correlation between altitude_index and score + """ + print("\n" + "="*70) + print("HYPOTHESIS 1: The lower the A_i, the better for cycling") + print("="*70) + + # Calculate correlations + pearson_corr, pearson_p = pearsonr(df['altitude_index'], df['score']) + spearman_corr, spearman_p = spearmanr(df['altitude_index'], df['score']) + + # Linear regression + X = df[['altitude_index']].values + y = df['score'].values + + model = LinearRegression() + model.fit(X, y) + y_pred = model.predict(X) + r2 = r2_score(y, y_pred) + + print(f"\nCorrelation Analysis:") + print(f" Pearson correlation: {pearson_corr:.3f} (p-value: {pearson_p:.4f})") + print(f" Spearman correlation: {spearman_corr:.3f} (p-value: {spearman_p:.4f})") + print(f"\nLinear Regression:") + print(f" R² score: {r2:.3f}") + print(f" Slope: {model.coef_[0]:.3f}") + print(f" Intercept: {model.intercept_:.3f}") + + # Interpret results + print(f"\nInterpretation:") + if pearson_corr < -0.3 and pearson_p < 0.05: + print(f" ✓ HYPOTHESIS SUPPORTED: Significant negative correlation found") + print(f" Lower altitude index is associated with higher bicycle scores") + elif pearson_corr < 0 and pearson_p < 0.05: + print(f" ~ HYPOTHESIS PARTIALLY SUPPORTED: Weak negative correlation") + elif pearson_p >= 0.05: + print(f" ✗ HYPOTHESIS NOT SIGNIFICANT: No statistically significant relationship") + 
else: + print(f" ✗ HYPOTHESIS NOT SUPPORTED: Positive or no correlation found") + + return { + 'pearson_r': pearson_corr, + 'pearson_p': pearson_p, + 'spearman_r': spearman_corr, + 'spearman_p': spearman_p, + 'r2': r2, + 'slope': model.coef_[0], + 'intercept': model.intercept_ + } + + +def test_distance_hypothesis(df: pd.DataFrame) -> dict: + """ + Test Hypothesis 2: D_i closer to 1 = better for cycling + + Expected: Correlation between (1 - abs(1 - D_i)) and score + """ + print("\n" + "="*70) + print("HYPOTHESIS 2: The closer to 1 the D_i, the better for cycling") + print("="*70) + + # Calculate "closeness to 1" metric + df['distance_to_optimal'] = abs(1 - df['distance_index']) + + # Calculate correlations (negative correlation expected with distance from 1) + pearson_corr, pearson_p = pearsonr(df['distance_to_optimal'], df['score']) + spearman_corr, spearman_p = spearmanr(df['distance_to_optimal'], df['score']) + + # Linear regression + X = df[['distance_to_optimal']].values + y = df['score'].values + + model = LinearRegression() + model.fit(X, y) + y_pred = model.predict(X) + r2 = r2_score(y, y_pred) + + print(f"\nCorrelation Analysis:") + print(f" Pearson correlation: {pearson_corr:.3f} (p-value: {pearson_p:.4f})") + print(f" Spearman correlation: {spearman_corr:.3f} (p-value: {spearman_p:.4f})") + print(f"\nLinear Regression:") + print(f" R² score: {r2:.3f}") + print(f" Slope: {model.coef_[0]:.3f}") + print(f" Intercept: {model.intercept_:.3f}") + + # Interpret results + print(f"\nInterpretation:") + if pearson_corr < -0.3 and pearson_p < 0.05: + print(f" ✓ HYPOTHESIS SUPPORTED: Significant negative correlation found") + print(f" D_i values closer to 1 are associated with higher bicycle scores") + elif pearson_corr < 0 and pearson_p < 0.05: + print(f" ~ HYPOTHESIS PARTIALLY SUPPORTED: Weak negative correlation") + elif pearson_p >= 0.05: + print(f" ✗ HYPOTHESIS NOT SIGNIFICANT: No statistically significant relationship") + else: + print(f" ✗ HYPOTHESIS NOT SUPPORTED: Positive or no correlation found") + + return { + 'pearson_r': pearson_corr, + 'pearson_p': pearson_p, + 'spearman_r': spearman_corr, + 'spearman_p': spearman_p, + 'r2': r2, + 'slope': model.coef_[0], + 'intercept': model.intercept_ + } + + +def create_visualizations(df: pd.DataFrame, output_dir: str = '../results'): + """Create visualization plots for the analysis.""" + print("\n" + "="*70) + print("Creating Visualizations") + print("="*70) + + os.makedirs(output_dir, exist_ok=True) + + # Set style + sns.set_style("whitegrid") + plt.rcParams['figure.figsize'] = (14, 10) + + # Create a 2x2 subplot figure + fig, axes = plt.subplots(2, 2, figsize=(16, 12)) + + # Plot 1: Altitude Index vs Score + ax1 = axes[0, 0] + sns.scatterplot(data=df, x='altitude_index', y='score', s=100, alpha=0.7, ax=ax1) + + # Add regression line + z = np.polyfit(df['altitude_index'], df['score'], 1) + p = np.poly1d(z) + x_line = np.linspace(df['altitude_index'].min(), df['altitude_index'].max(), 100) + ax1.plot(x_line, p(x_line), "r--", alpha=0.8, linewidth=2, label='Linear fit') + + ax1.set_xlabel('Altitude Index (A_i)', fontsize=12, fontweight='bold') + ax1.set_ylabel('Bicycle Cities Index Score', fontsize=12, fontweight='bold') + ax1.set_title('Hypothesis 1: Altitude Index vs Cycling Performance', fontsize=14, fontweight='bold') + ax1.legend() + + # Add text annotations for some cities + for idx, row in df.iterrows(): + if idx % 3 == 0: # Annotate every 3rd city to avoid crowding + ax1.annotate(row['city'], (row['altitude_index'], row['score']), 
+ xytext=(5, 5), textcoords='offset points', fontsize=8, alpha=0.7) + + # Plot 2: Distance Index vs Score + ax2 = axes[0, 1] + sns.scatterplot(data=df, x='distance_index', y='score', s=100, alpha=0.7, ax=ax2) + + # Add regression line + z = np.polyfit(df['distance_index'], df['score'], 1) + p = np.poly1d(z) + x_line = np.linspace(df['distance_index'].min(), df['distance_index'].max(), 100) + ax2.plot(x_line, p(x_line), "r--", alpha=0.8, linewidth=2, label='Linear fit') + + # Add vertical line at D_i = 1 (optimal) + ax2.axvline(x=1, color='green', linestyle=':', linewidth=2, alpha=0.5, label='Optimal (D_i=1)') + + ax2.set_xlabel('Distance Index (D_i)', fontsize=12, fontweight='bold') + ax2.set_ylabel('Bicycle Cities Index Score', fontsize=12, fontweight='bold') + ax2.set_title('Hypothesis 2: Distance Index vs Cycling Performance', fontsize=14, fontweight='bold') + ax2.legend() + + # Plot 3: Distance from Optimal (|1 - D_i|) vs Score + ax3 = axes[1, 0] + df['distance_to_optimal'] = abs(1 - df['distance_index']) + sns.scatterplot(data=df, x='distance_to_optimal', y='score', s=100, alpha=0.7, ax=ax3) + + # Add regression line + z = np.polyfit(df['distance_to_optimal'], df['score'], 1) + p = np.poly1d(z) + x_line = np.linspace(df['distance_to_optimal'].min(), df['distance_to_optimal'].max(), 100) + ax3.plot(x_line, p(x_line), "r--", alpha=0.8, linewidth=2, label='Linear fit') + + ax3.set_xlabel('Distance from Optimal (|1 - D_i|)', fontsize=12, fontweight='bold') + ax3.set_ylabel('Bicycle Cities Index Score', fontsize=12, fontweight='bold') + ax3.set_title('Distance from Optimal D_i vs Cycling Performance', fontsize=14, fontweight='bold') + ax3.legend() + + # Plot 4: Combined heatmap showing relationships + ax4 = axes[1, 1] + + # Create correlation matrix + corr_data = df[['altitude_index', 'distance_index', 'distance_to_optimal', 'score']].corr() + + sns.heatmap(corr_data, annot=True, fmt='.3f', cmap='coolwarm', center=0, + square=True, linewidths=1, cbar_kws={"shrink": 0.8}, ax=ax4) + ax4.set_title('Correlation Matrix', fontsize=14, fontweight='bold') + + plt.tight_layout() + + # Save figure + output_path = os.path.join(output_dir, 'hypothesis_testing_results.png') + plt.savefig(output_path, dpi=300, bbox_inches='tight') + print(f"\n✓ Saved visualization to: {output_path}") + + # Also save individual plots for the README + fig2, ax = plt.subplots(1, 1, figsize=(10, 6)) + sns.scatterplot(data=df, x='altitude_index', y='score', s=150, alpha=0.7) + z = np.polyfit(df['altitude_index'], df['score'], 1) + p = np.poly1d(z) + x_line = np.linspace(df['altitude_index'].min(), df['altitude_index'].max(), 100) + ax.plot(x_line, p(x_line), "r--", alpha=0.8, linewidth=2) + ax.set_xlabel('Altitude Index (A_i)', fontsize=12, fontweight='bold') + ax.set_ylabel('Bicycle Cities Index Score', fontsize=12, fontweight='bold') + ax.set_title('Altitude Index vs Cycling Performance', fontsize=14, fontweight='bold') + plt.tight_layout() + plt.savefig(os.path.join(output_dir, 'altitude_index_plot.png'), dpi=300, bbox_inches='tight') + plt.close() + + print(f"✓ Saved individual plot: altitude_index_plot.png") + + plt.show() + + +def save_results(df: pd.DataFrame, altitude_results: dict, distance_results: dict, + output_dir: str = '../results'): + """Save analysis results to CSV files.""" + print("\n" + "="*70) + print("Saving Results") + print("="*70) + + os.makedirs(output_dir, exist_ok=True) + + # Save city data with calculated indices + output_path = os.path.join(output_dir, 'cities_with_indices.csv') + 
df.to_csv(output_path, index=False) + print(f"\n✓ Saved city data to: {output_path}") + + # Save statistical results + results_summary = { + 'Hypothesis': ['Altitude Index (Lower is better)', 'Distance Index (Closer to 1 is better)'], + 'Pearson_r': [altitude_results['pearson_r'], distance_results['pearson_r']], + 'Pearson_p': [altitude_results['pearson_p'], distance_results['pearson_p']], + 'Spearman_r': [altitude_results['spearman_r'], distance_results['spearman_r']], + 'Spearman_p': [altitude_results['spearman_p'], distance_results['spearman_p']], + 'R2_score': [altitude_results['r2'], distance_results['r2']], + 'Slope': [altitude_results['slope'], distance_results['slope']], + 'Intercept': [altitude_results['intercept'], distance_results['intercept']] + } + + results_df = pd.DataFrame(results_summary) + output_path = os.path.join(output_dir, 'statistical_results.csv') + results_df.to_csv(output_path, index=False) + print(f"✓ Saved statistical results to: {output_path}") + + +def main(): + """Main execution function.""" + print("="*70, flush=True) + print("BIKENV PREDICTION PLATFORM", flush=True) + print("Testing Altitude and Distance Index Hypotheses", flush=True) + print("Data: Copenhagenize Index 2025 Edition", flush=True) + print("="*70, flush=True) + + try: + # Load data + data_path = '../data/copenhagenize_index_2025.csv' + print(f"\nLoading data from: {data_path}", flush=True) + df = load_bicycle_index_data(data_path) + print(f"✓ Loaded {len(df)} cities from index", flush=True) + + # Calculate indices for sampled cities + df_with_indices = calculate_indices_for_cities(df, sample_size=15) + + # Check if we have enough data to proceed + if len(df_with_indices) < 5: + print("\n⚠ ERROR: Not enough cities calculated successfully.", flush=True) + print(f"Need at least 5 cities, got {len(df_with_indices)}", flush=True) + print("Cannot perform statistical analysis.", flush=True) + return + + print(f"\nProceeding with analysis using {len(df_with_indices)} cities...", flush=True) + + # Test hypotheses + print("\n" + "="*70, flush=True) + print("TESTING HYPOTHESES", flush=True) + print("="*70, flush=True) + altitude_results = test_altitude_hypothesis(df_with_indices) + distance_results = test_distance_hypothesis(df_with_indices) + + # Create visualizations + create_visualizations(df_with_indices) + + # Save results + save_results(df_with_indices, altitude_results, distance_results) + + print("\n" + "="*70) + print("ANALYSIS COMPLETE") + print("="*70) + print("\nResults and visualizations have been saved to the 'results/' directory.") + print("Review the plots and statistical summaries to evaluate the hypotheses.") + + except KeyboardInterrupt: + print("\n\n⚠ Analysis interrupted by user.") + print("Partial results may be available in the results/ directory.") + + except Exception as e: + print(f"\n\n❌ ERROR: {e}") + print("Analysis could not be completed.") + import traceback + traceback.print_exc() + + +if __name__ == "__main__": + main() diff --git a/bikenv/__init__.py b/bikenv/__init__.py deleted file mode 100644 index 38b48e1..0000000 --- a/bikenv/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from ._api import * \ No newline at end of file diff --git a/bikenv/_api.py b/bikenv/_api.py deleted file mode 100644 index f0969c7..0000000 --- a/bikenv/_api.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Expose most common parts of public API directly in `bikenv.` namespace.""" - -from .module import get_region -from .module import altitude_index -from .module import distance_index \ No newline at end of file 
diff --git a/bikenv/module.py b/bikenv/module.py deleted file mode 100644 index 625483e..0000000 --- a/bikenv/module.py +++ /dev/null @@ -1,58 +0,0 @@ -"""Hola.""" - -import osmnx as ox -import networkx as nx - - -def get_region(name, network_type="drive"): - """Get the region from OSM as a graph. - - Parameters - ---------- - region : string - The name of the region to get from OSM - network_type : string - The type of network to get from OSM. Default is 'drive' - - Returns - ------- - road_network : networkx multidigraph - """ - road_network = "Hola" - return road_network - - -def altitude_index(G, google_key): - """Calculate the index of a graph based on the altitude of the nodes. - - Parameters - ---------- - G : networkx multidigraph - The graph to calculate the index - google_key : string - The key to use the Google Elevation API - - Returns - ------- - index : float - The index of the graph - """ - index = 24.5 - return index - - -def distance_index(G): - """Calculate the index of a graph based on the distance of the nodes. - - Parameters - ---------- - G : networkx multidigraph - The graph to calculate the index - - Returns - ------- - index : float - - """ - index = 1849 - return index diff --git a/data/copenhagenize_index_2025.csv b/data/copenhagenize_index_2025.csv new file mode 100644 index 0000000..10efc40 --- /dev/null +++ b/data/copenhagenize_index_2025.csv @@ -0,0 +1,31 @@ +rank,city,country,score +1,Utrecht,Netherlands,71.1 +2,Copenhagen,Denmark,70.8 +3,Ghent,Belgium,67.6 +4,Amsterdam,Netherlands,66.6 +5,Paris,France,65.0 +6,Helsinki,Finland,64.9 +7,Münster,Germany,64.7 +8,Antwerp,Belgium,64.4 +9,Bordeaux,France,62.9 +10,Nantes,France,62.8 +11,Bonn,Germany,61.4 +12,The Hague,Netherlands,61.0 +13,Strasbourg,France,60.3 +14,Lyon,France,58.9 +15,Montréal,Canada,58.3 +16,Malmö,Sweden,57.7 +17,Munich,Germany,57.6 +18,Oslo,Norway,57.2 +19,Vienna,Austria,56.7 +20,Bern,Switzerland,56.4 +21,Graz,Austria,55.8 +22,Zurich,Switzerland,55.7 +23,Rotterdam,Netherlands,55.1 +24,Ljubljana,Slovenia,54.6 +25,Bologna,Italy,54.4 +26,Stockholm,Sweden,53.4 +27,Vitoria-Gasteiz,Spain,52.2 +28,Wroclaw,Poland,51.3 +29,Québec,Canada,51.1 +30,Vancouver,Canada,50.3 diff --git a/requirements-platform.txt b/requirements-platform.txt new file mode 100644 index 0000000..17a91fe --- /dev/null +++ b/requirements-platform.txt @@ -0,0 +1,11 @@ +# Requirements for bikenv prediction platform +pandas>=2.0.0 +numpy>=1.24.0 +matplotlib>=3.7.0 +seaborn>=0.12.0 +scipy>=1.10.0 +requests>=2.31.0 +osmnx>=1.9.0 +networkx>=3.0 +geopandas>=0.14.0 +scikit-learn>=1.3.0 diff --git a/results/altitude_index_plot.png b/results/altitude_index_plot.png new file mode 100644 index 0000000..297ea64 Binary files /dev/null and b/results/altitude_index_plot.png differ diff --git a/results/cities_with_indices.csv b/results/cities_with_indices.csv new file mode 100644 index 0000000..dc31849 --- /dev/null +++ b/results/cities_with_indices.csv @@ -0,0 +1,14 @@ +rank,city,country,score,altitude_index,distance_index,distance_to_optimal +1,Utrecht,Netherlands,71.1,1.6531753747174007,1.060795729919409,0.06079572991940907 +2,Copenhagen,Denmark,70.8,2.2123228295321105,1.0176648339980587,0.017664833998058738 +3,Ghent,Belgium,67.6,1.7781695472164336,1.0783669613539206,0.07836696135392063 +4,Amsterdam,Netherlands,66.6,1.7509848798542698,1.055778059995735,0.05577805999573493 +5,Paris,France,65.0,5.103024044503198,1.0260727439171888,0.026072743917188834 +13,Strasbourg,France,60.3,4.174544714256035,1.0756833983843932,0.0756833983843932 
+14,Lyon,France,58.9,5.50919771023399,1.0555731387669123,0.05557313876691228 +15,Montréal,Canada,58.3,4.089636886920087,1.074643614231217,0.07464361423121701 +16,Malmö,Sweden,57.7,2.603019586527989,1.065535672453737,0.06553567245373704 +17,Munich,Germany,57.6,3.660945712727582,1.0443286290169542,0.044328629016954224 +26,Stockholm,Sweden,53.4,4.743498314416578,1.0772477586207407,0.07724775862074074 +27,Vitoria-Gasteiz,Spain,52.2,4.381077528662655,1.0769575210421953,0.07695752104219533 +28,Wroclaw,Poland,51.3,4.043849823683032,1.0643413995907665,0.06434139959076646 diff --git a/results/hypothesis_testing_results.png b/results/hypothesis_testing_results.png new file mode 100644 index 0000000..b035f26 Binary files /dev/null and b/results/hypothesis_testing_results.png differ diff --git a/results/statistical_results.csv b/results/statistical_results.csv new file mode 100644 index 0000000..4ed260e --- /dev/null +++ b/results/statistical_results.csv @@ -0,0 +1,3 @@ +Hypothesis,Pearson_r,Pearson_p,Spearman_r,Spearman_p,R2_score,Slope,Intercept +Altitude Index (Lower is better),-0.6568626722894959,0.014724087361700906,-0.521978021978022,0.06729171199351018,0.43146857024729746,-3.2957848176932303,72.4175943526749 +Distance Index (Closer to 1 is better),-0.4830863403151897,0.09447081133823827,-0.3131868131868132,0.2974384220710292,0.2333724121991234,-166.42546399901715,70.726548443211 diff --git a/scripts/calculate_indices.py b/scripts/calculate_indices.py new file mode 100644 index 0000000..8874f8b --- /dev/null +++ b/scripts/calculate_indices.py @@ -0,0 +1,189 @@ +""" +Calculate altitude and distance indices for cities. + +The altitude index (A_i) measures the hilliness of a city. +The distance index (D_i) measures the network connectivity/compactness. +""" + +import osmnx as ox +import networkx as nx +import numpy as np +from typing import Tuple, Optional + +# Configure OSMnx to use free Open Topo Data API instead of Google +ox.settings.elevation_url_template = \ + "https://api.opentopodata.org/v1/aster30m?locations={locations}" + + +def calculate_altitude_index(city_name: str, country: Optional[str] = None) -> float: + """ + Calculate the altitude index (A_i) for a city. + + The altitude index quantifies how hilly a city is. Lower values indicate + flatter terrain, which is better for cycling. + + A_i = (mean_elevation_change / mean_edge_length) * 100 + + Where: + - mean_elevation_change: Average elevation difference across road segments + - mean_edge_length: Average length of road segments + + The result is multiplied by 100 to get a percentage-like value. 
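+
+    For example, an average climb of 1.5 m across segments that average 100 m
+    in length gives A_i = (1.5 / 100) * 100 = 1.5.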
+
+    Hypothesis: Lower A_i = better for cycling (less climbing required)
+
+    Parameters
+    ----------
+    city_name : str
+        Name of the city
+    country : str, optional
+        Country name to disambiguate cities
+
+    Returns
+    -------
+    float or None
+        Altitude index value (lower is better for cycling), or None if the
+        calculation fails
+    """
+    try:
+        # Construct query
+        query = f"{city_name}, {country}" if country else city_name
+
+        # Get the road network with elevation data
+        G = ox.graph_from_place(query, network_type="bike")
+
+        # Add elevation data via the free Open Topo Data API (no key needed,
+        # thanks to the elevation_url_template override above)
+        G = ox.add_node_elevations_google(G, api_key=None, batch_size=100, pause=0.5)
+
+        # Calculate elevation changes for each edge
+        elevation_changes = []
+        edge_lengths = []
+
+        for u, v, data in G.edges(data=True):
+            if 'length' in data:
+                # Get elevations of start and end nodes
+                elev_u = G.nodes[u].get('elevation', 0)
+                elev_v = G.nodes[v].get('elevation', 0)
+
+                # Calculate absolute elevation change
+                elev_change = abs(elev_v - elev_u)
+                elevation_changes.append(elev_change)
+                edge_lengths.append(data['length'])
+
+        # Calculate mean values
+        mean_elev_change = np.mean(elevation_changes)
+        mean_edge_length = np.mean(edge_lengths)
+
+        # Calculate altitude index
+        if mean_edge_length > 0:
+            altitude_index = (mean_elev_change / mean_edge_length) * 100
+        else:
+            altitude_index = 0
+
+        return altitude_index
+
+    except Exception as e:
+        print(f"Error calculating altitude index for {city_name}: {e}")
+        return None
+
+
+def calculate_distance_index(city_name: str, country: Optional[str] = None) -> Optional[float]:
+    """
+    Calculate the distance index (D_i) for a city.
+
+    The distance index measures how connected/compact a city's bike network is.
+    Values closer to 1 indicate better connectivity.
+
+    D_i = circuity / (1 + normalized_node_density)
+
+    Where:
+    - circuity: Ratio of network distances to straight-line distances (closer to 1 is better)
+    - normalized_node_density: Nodes per km² normalized to [0, 1]
+
+    Hypothesis: D_i closer to 1 = better for cycling (more direct routes, better connectivity)
+
+    Parameters
+    ----------
+    city_name : str
+        Name of the city
+    country : str, optional
+        Country name to disambiguate cities
+
+    Returns
+    -------
+    float or None
+        Distance index value (closer to 1 is better), or None if the
+        calculation fails
+    """
+    try:
+        # Construct query
+        query = f"{city_name}, {country}" if country else city_name
+
+        # Get the road network
+        G = ox.graph_from_place(query, network_type="bike")
+
+        # Calculate basic stats
+        stats = ox.basic_stats(G)
+
+        # Get circuity (how direct the routes are)
+        # Circuity = 1 means perfectly direct routes
+        circuity = stats.get('circuity_avg', 1.0)
+
+        # Get node density (nodes per km²); fall back to 0 if basic_stats did
+        # not report it (it may be omitted when no area is supplied), in which
+        # case D_i reduces to the circuity alone
+        node_density = stats.get('node_density_km') or 0
+
+        # Normalize node density (assuming max typical density of 500 nodes/km²)
+        max_density = 500
+        normalized_density = min(node_density / max_density, 1.0)
+
+        # Calculate distance index
+        # Lower circuity is better (more direct)
+        # Higher node density is better (more connected)
+        distance_index = circuity / (1 + normalized_density)
+
+        return distance_index
+
+    except Exception as e:
+        print(f"Error calculating distance index for {city_name}: {e}")
+        return None
+
+
+def calculate_indices_for_city(city_name: str, country: Optional[str] = None) -> Tuple[Optional[float], Optional[float]]:
+    """
+    Calculate both altitude and distance indices for a city.
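+
+    Runs the altitude calculation first; if it fails, the distance calculation
+    is skipped and (None, None) is returned.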
+ + Parameters + ---------- + city_name : str + Name of the city + country : str, optional + Country name to disambiguate cities + + Returns + ------- + tuple + (altitude_index, distance_index) + """ + print(f"Calculating indices for {city_name}...") + + altitude_idx = calculate_altitude_index(city_name, country) + + # If altitude calculation failed, don't attempt distance calculation + # (likely same underlying issue - area too large, network problem, etc.) + if altitude_idx is None: + print(f" Skipping distance calculation for {city_name} (altitude calculation failed)") + return None, None + + distance_idx = calculate_distance_index(city_name, country) + + return altitude_idx, distance_idx + + +if __name__ == "__main__": + # Test with a sample city + test_city = "Amsterdam" + test_country = "Netherlands" + + a_i, d_i = calculate_indices_for_city(test_city, test_country) + + print(f"\n{test_city} Results:") + print(f"Altitude Index (A_i): {a_i:.3f}") + print(f"Distance Index (D_i): {d_i:.3f}") diff --git a/scripts/retrieve_data.py b/scripts/retrieve_data.py new file mode 100644 index 0000000..22550bc --- /dev/null +++ b/scripts/retrieve_data.py @@ -0,0 +1,183 @@ +""" +Data Retrieval Script for Copenhagenize Index + +This script scrapes the latest Copenhagenize Index data from their official website. + +The Copenhagenize Index (official name: "The Global Ranking of Bicycle-Friendly Cities") +is published by Copenhagenize Design Company and EIT Urban Mobility. + +Source: https://copenhagenizeindex.eu/ +""" + +import re +import csv +import os +from typing import List, Dict + + +def get_copenhagenize_data_manual() -> List[Dict[str, str]]: + """ + Manual data entry from Copenhagenize Index 2025. + + NOTE: This is a temporary solution. For automated scraping, we would need: + - requests + BeautifulSoup for HTML parsing + - Or Selenium for JavaScript-rendered content + - Proper error handling and rate limiting + + Source: https://copenhagenizeindex.eu/ + Last updated: December 2025 + Edition: 2025 (EIT Urban Mobility Edition) + """ + + # Data extracted from https://copenhagenizeindex.eu/ on December 27, 2025 + # This is the Top 30 from the 2025 edition + cities_data = [ + {"rank": 1, "city": "Utrecht", "country": "Netherlands", "score": 71.1}, + {"rank": 2, "city": "Copenhagen", "country": "Denmark", "score": 70.8}, + {"rank": 3, "city": "Ghent", "country": "Belgium", "score": 67.6}, + {"rank": 4, "city": "Amsterdam", "country": "Netherlands", "score": 66.6}, + {"rank": 5, "city": "Paris", "country": "France", "score": 65.0}, + {"rank": 6, "city": "Helsinki", "country": "Finland", "score": 64.9}, + {"rank": 7, "city": "Münster", "country": "Germany", "score": 64.7}, + {"rank": 8, "city": "Antwerp", "country": "Belgium", "score": 64.4}, + {"rank": 9, "city": "Bordeaux", "country": "France", "score": 62.9}, + {"rank": 10, "city": "Nantes", "country": "France", "score": 62.8}, + {"rank": 11, "city": "Bonn", "country": "Germany", "score": 61.4}, + {"rank": 12, "city": "The Hague", "country": "Netherlands", "score": 61.0}, + {"rank": 13, "city": "Strasbourg", "country": "France", "score": 60.3}, + {"rank": 14, "city": "Lyon", "country": "France", "score": 58.9}, + {"rank": 15, "city": "Montréal", "country": "Canada", "score": 58.3}, + {"rank": 16, "city": "Malmö", "country": "Sweden", "score": 57.7}, + {"rank": 17, "city": "Munich", "country": "Germany", "score": 57.6}, + {"rank": 18, "city": "Oslo", "country": "Norway", "score": 57.2}, + {"rank": 19, "city": "Vienna", "country": "Austria", "score": 
56.7},
+        {"rank": 20, "city": "Bern", "country": "Switzerland", "score": 56.4},
+        {"rank": 21, "city": "Graz", "country": "Austria", "score": 55.8},
+        {"rank": 22, "city": "Zurich", "country": "Switzerland", "score": 55.7},
+        {"rank": 23, "city": "Rotterdam", "country": "Netherlands", "score": 55.1},
+        {"rank": 24, "city": "Ljubljana", "country": "Slovenia", "score": 54.6},
+        {"rank": 25, "city": "Bologna", "country": "Italy", "score": 54.4},
+        {"rank": 26, "city": "Stockholm", "country": "Sweden", "score": 53.4},
+        {"rank": 27, "city": "Vitoria-Gasteiz", "country": "Spain", "score": 52.2},
+        {"rank": 28, "city": "Wroclaw", "country": "Poland", "score": 51.3},
+        {"rank": 29, "city": "Québec", "country": "Canada", "score": 51.1},
+        {"rank": 30, "city": "Vancouver", "country": "Canada", "score": 50.3},
+    ]
+
+    return cities_data
+
+
+def save_to_csv(data: List[Dict], output_file: str = "../data/copenhagenize_index_2025.csv"):
+    """Save the data to CSV format."""
+
+    if not data:
+        print("No data to save")
+        return
+
+    # Resolve the (possibly relative) output path against the script's location
+    output_file = os.path.join(os.path.dirname(__file__), output_file)
+
+    # Ensure the directory exists
+    os.makedirs(os.path.dirname(output_file), exist_ok=True)
+
+    fieldnames = ["rank", "city", "country", "score"]
+
+    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(data)
+
+    print(f"✓ Saved {len(data)} cities to {output_file}")
+
+
+def display_data_info(data: List[Dict]):
+    """Display information about the retrieved data."""
+
+    print("="*70)
+    print("COPENHAGENIZE INDEX 2025 - Data Retrieved")
+    print("="*70)
+
+    print(f"\nTotal cities: {len(data)}")
+    print(f"Top city: {data[0]['city']} ({data[0]['country']}) - Score: {data[0]['score']}")
+    print(f"Last city: {data[-1]['city']} ({data[-1]['country']}) - Score: {data[-1]['score']}")
+
+    # Country distribution
+    countries = {}
+    for city in data:
+        country = city['country']
+        countries[country] = countries.get(country, 0) + 1
+
+    print(f"\nCountries represented: {len(countries)}")
+    print("\nTop countries by number of cities:")
+    sorted_countries = sorted(countries.items(), key=lambda x: x[1], reverse=True)
+    for country, count in sorted_countries[:5]:
+        print(f"  {country}: {count} cities")
+
+    print("\n" + "="*70)
+
+
+def main():
+    """Main execution function."""
+
+    print("="*70)
+    print("COPENHAGENIZE INDEX - Data Retrieval")
+    print("="*70)
+    print("\nSource: https://copenhagenizeindex.eu/")
+    print("Edition: 2025 (EIT Urban Mobility Edition)")
+    print("Method: Manual entry (Top 30 cities)")
+    print("\nNOTE: For automated scraping, install: requests, beautifulsoup4")
+    print("="*70)
+
+    # Get data
+    print("\n✓ Retrieving data...")
+    data = get_copenhagenize_data_manual()
+
+    # Display info
+    display_data_info(data)
+
+    # Save to CSV
+    print("\n✓ Saving to CSV...")
+    save_to_csv(data)
+
+    print("\n✓ Data retrieval complete!")
+    print("\nNext steps:")
+    print("  1. Review: cat ../data/copenhagenize_index_2025.csv")
+    print("  2. Update analysis scripts to use 2025 data")
+    print("  3. 
Run: python3 ../analysis/prediction_platform.py") + + +if __name__ == "__main__": + main() + + +""" +FUTURE IMPROVEMENTS: + +For automated web scraping, add these dependencies: + pip install requests beautifulsoup4 selenium + +Example implementation: + +import requests +from bs4 import BeautifulSoup + +def scrape_copenhagenize_index(): + url = "https://copenhagenizeindex.eu/" + response = requests.get(url) + soup = BeautifulSoup(response.content, 'html.parser') + + # Find city elements (requires inspecting HTML structure) + cities = soup.find_all('div', class_='city-item') # Example selector + + data = [] + for city in cities: + rank = city.find('span', class_='rank').text + name = city.find('h3', class_='city-name').text + score = city.find('span', class_='score').text + # ... parse and structure data + + return data + +Note: The actual selectors depend on the website's HTML structure. +The site may use JavaScript rendering, requiring Selenium instead. +""" diff --git a/setup.py b/setup.py deleted file mode 100644 index 084d9c6..0000000 --- a/setup.py +++ /dev/null @@ -1,32 +0,0 @@ -from setuptools import setup, find_packages - -VERSION = '0.0.1' -DESCRIPTION = 'Quantifies certain environmental factors that affect cycling' -LONG_DESCRIPTION = """ -bikenv (biking environment) is intended to be used by researchers to quantify some environmental factors that affect cycling for a given region. -""" - - -setup( - name='bikenv', - packages=find_packages(include=['bikenv']), - version=VERSION, - description=DESCRIPTION, - long_description=LONG_DESCRIPTION, - url='https://bikenv.readthedocs.io/en/latest/index.html', - author='Fabián Abarca & Jose Daniel Marín', - license='MIT', - install_requires=[ - 'numpy', - 'scipy', - 'pandas', - 'geopandas', - 'osmnx', - 'networkx', - ], - classifiers=[ - 'Development Status :: 1 - Planning', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT License', - ], -)