Lab 1. Interactive analysis of SF Bay Area Bike Share station data in Apache Spark using Spark SQL and the DataFrame API
Data description
https://www.kaggle.com/benhamner/sf-bay-area-bike-share
stations.csv schema:
id: station ID number
name: name of station
lat: latitude
long: longitude
dock_count: number of total docks at station
city: city (San Francisco, Redwood City, Palo Alto, Mountain View, San Jose)
installation_date: original date that station was installed. If station was moved, it is noted below.
trips.csv schema (an explicit StructType sketch follows the field list):
id: numeric ID of bike trip
duration: time of trip in seconds
start_date: start date of trip with date and time, in PST
start_station_name: station name of start station
start_station_id: numeric reference for start station
end_date: end date of trip with date and time, in PST
end_station_name: station name for end station
end_station_id: numeric reference for end station
bike_id: ID of bike used
subscription_type: Subscriber = annual or 30-day member; Customer = 24-hour or 3-day member
zip_code: Home zip code of subscriber (customers can choose to manually enter zip at kiosk however data is unreliable)
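The columns described above can also be written down as an explicit schema. A minimal sketch (not part of the original lab): a StructType built from the trips.csv description, which could later be passed to spark.read.schema(tripSchema) instead of relying on inferSchema.
from pyspark.sql.types import (StructType, StructField, IntegerType,
                               TimestampType, StringType)

# Hypothetical name; the fields mirror the trips.csv description above.
tripSchema = StructType([
    StructField("id", IntegerType()),
    StructField("duration", IntegerType()),
    StructField("start_date", TimestampType()),
    StructField("start_station_name", StringType()),
    StructField("start_station_id", IntegerType()),
    StructField("end_date", TimestampType()),
    StructField("end_station_name", StringType()),
    StructField("end_station_id", IntegerType()),
    StructField("bike_id", IntegerType()),
    StructField("subscription_type", StringType()),
    StructField("zip_code", StringType()),
])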
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pyspark.sql as sql

conf = SparkConf().setAppName("L1_interactive_bike_analysis").setMaster('yarn')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
Example of reading CSV files and handling defective data
The list of read and write options for CSV files: https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option
The Spark SQL timestamp pattern format differs from the Python datetime library (a small illustration follows): https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
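As an illustration of the difference (an assumption, using a raw value in the trips.csv date format):
from datetime import datetime

# Python's datetime uses strptime-style directives for a value like "8/29/2013 14:13"...
datetime.strptime("8/29/2013 14:13", "%m/%d/%Y %H:%M")
# ...while Spark SQL uses the pattern 'M/d/y H:m' for the same string
# (see the timestampFormat option in the next cell).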
tripData = spark.read\
    .option("header", True)\
    .option("inferSchema", True)\
    .option("timestampFormat", 'M/d/y H:m')\
    .csv("trips.csv")
tripData
DataFrame[id: int, duration: int, start_date: timestamp, start_station_name: string, start_station_id: int, end_date: timestamp, end_station_name: string, end_station_id: int, bike_id: int, subscription_type: string, zip_code: string]
tripData.printSchema()
root
 |-- id: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- start_date: timestamp (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- start_station_id: integer (nullable = true)
 |-- end_date: timestamp (nullable = true)
 |-- end_station_name: string (nullable = true)
 |-- end_station_id: integer (nullable = true)
 |-- bike_id: integer (nullable = true)
 |-- subscription_type: string (nullable = true)
 |-- zip_code: string (nullable = true)
tripData.show(n=5)
+----+--------+-------------------+--------------------+----------------+-------------------+--------------------+--------------+-------+-----------------+--------+
| id|duration| start_date| start_station_name|start_station_id| end_date| end_station_name|end_station_id|bike_id|subscription_type|zip_code|
+----+--------+-------------------+--------------------+----------------+-------------------+--------------------+--------------+-------+-----------------+--------+
|4576| 63| null|South Van Ness at...| 66|2013-08-29 14:14:00|South Van Ness at...| 66| 520| Subscriber| 94127|
|4607| null|2013-08-29 14:42:00| San Jose City Hall| 10|2013-08-29 14:43:00| San Jose City Hall| 10| 661| Subscriber| 95138|
|4130| 71|2013-08-29 10:16:00|Mountain View Cit...| 27|2013-08-29 10:17:00|Mountain View Cit...| 27| 48| Subscriber| 97214|
|4251| 77|2013-08-29 11:29:00| San Jose City Hall| 10|2013-08-29 11:30:00| San Jose City Hall| 10| 26| Subscriber| 95060|
|4299| 83|2013-08-29 12:02:00|South Van Ness at...| 66|2013-08-29 12:04:00| Market at 10th| 67| 319| Subscriber| 94103|
+----+--------+-------------------+--------------------+----------------+-------------------+--------------------+--------------+-------+-----------------+--------+
only showing top 5 rows
? tripData.dropna
Signature: tripData.dropna(how='any', thresh=None, subset=None)
Docstring:
Returns a new :class:`DataFrame` omitting rows with null values.
:func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are aliases of each other.

.. versionadded:: 1.3.1

Parameters
----------
how : str, optional
    'any' or 'all'. If 'any', drop a row if it contains any nulls.
    If 'all', drop a row only if all its values are null.
thresh: int, optional, default None
    If specified, drop rows that have less than `thresh` non-null values.
    This overwrites the `how` parameter.
subset : str, tuple or list, optional
    optional list of column names to consider.

Examples
--------
>>> df4.na.drop().show()
+---+------+-----+
|age|height| name|
+---+------+-----+
| 10|    80|Alice|
+---+------+-----+

File:      ~/.local/lib/python3.9/site-packages/pyspark/sql/dataframe.py
Type:      method
tripData.dropna().show(n=5)
+----+--------+-------------------+--------------------+----------------+-------------------+--------------------+--------------+-------+-----------------+--------+
| id|duration| start_date| start_station_name|start_station_id| end_date| end_station_name|end_station_id|bike_id|subscription_type|zip_code|
+----+--------+-------------------+--------------------+----------------+-------------------+--------------------+--------------+-------+-----------------+--------+
|4130| 71|2013-08-29 10:16:00|Mountain View Cit...| 27|2013-08-29 10:17:00|Mountain View Cit...| 27| 48| Subscriber| 97214|
|4251| 77|2013-08-29 11:29:00| San Jose City Hall| 10|2013-08-29 11:30:00| San Jose City Hall| 10| 26| Subscriber| 95060|
|4299| 83|2013-08-29 12:02:00|South Van Ness at...| 66|2013-08-29 12:04:00| Market at 10th| 67| 319| Subscriber| 94103|
|4927| 103|2013-08-29 18:54:00| Golden Gate at Polk| 59|2013-08-29 18:56:00| Golden Gate at Polk| 59| 527| Subscriber| 94109|
|4500| 109|2013-08-29 13:25:00|Santa Clara at Al...| 4|2013-08-29 13:27:00| Adobe on Almaden| 5| 679| Subscriber| 95112|
+----+--------+-------------------+--------------------+----------------+-------------------+--------------------+--------------+-------+-----------------+--------+
only showing top 5 rows
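dropna removes defective rows entirely. As an alternative sketch (not part of the original lab), missing values can be replaced per column with fillna; the replacement value 0 here is only an illustration.
# Sketch: replace a missing duration with 0 instead of dropping the row
tripDataFilled = tripData.fillna({'duration': 0})
tripDataFilled.show(n=5)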
tripData.describe().show()
+-------+------------------+------------------+--------------------+------------------+--------------------+------------------+------------------+-----------------+-------------------+
|summary| id| duration| start_station_name| start_station_id| end_station_name| end_station_id| bike_id|subscription_type| zip_code|
+-------+------------------+------------------+--------------------+------------------+--------------------+------------------+------------------+-----------------+-------------------+
| count| 985352| 985351| 985352| 985352| 985352| 985352| 985352| 985352| 978484|
| mean|521401.35102481145|1092.1337716204682| null|58.060558054380564| null|58.043319544690625|426.07050779822845| null| 2008421.845039644|
| stddev|245889.28553182338| 25689.80328664839| null|16.998928220336474| null| 17.10456379871572| 155.4614377036669| null|1.243190730418017E9|
| min| 4069| 60| 2nd at Folsom| 2| 2nd at Folsom| 2| 9| Customer| 0|
| max| 913460| 17270400|Yerba Buena Cente...| 84|Yerba Buena Cente...| 84| 878| Subscriber| v6z2x|
+-------+------------------+------------------+--------------------+------------------+--------------------+------------------+------------------+-----------------+-------------------+
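The describe() output above hints at defective values: a maximum duration of 17270400 seconds (about 200 days) and non-numeric zip codes such as v6z2x. A quick sanity-check sketch (an assumption, not part of the original lab):
# How many trips are longer than one day?
tripData.filter(tripData.duration > 24 * 3600).count()
# How many zip codes are not a plain 5-digit value?
tripData.filter(~tripData.zip_code.rlike('^[0-9]{5}$')).count()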
stationData = spark.read\
    .option("header", True)\
    .option("inferSchema", True)\
    .option("timestampFormat", 'M/d/y')\
    .csv("stations.csv")
stationData.printSchema()
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- dock_count: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- installation_date: timestamp (nullable = true)
stationData.show(n=5)
+---+--------------------+------------------+-------------------+----------+--------+-------------------+
| id| name| lat| long|dock_count| city| installation_date|
+---+--------------------+------------------+-------------------+----------+--------+-------------------+
| 2|San Jose Diridon ...| 37.329732|-121.90178200000001| 27|San Jose|2013-08-06 00:00:00|
| 3|San Jose Civic Ce...| 37.330698| -121.888979| 15|San Jose|2013-08-05 00:00:00|
| 4|Santa Clara at Al...| 37.333988| -121.894902| 11|San Jose|2013-08-06 00:00:00|
| 5| Adobe on Almaden| 37.331415| -121.8932| 19|San Jose|2013-08-05 00:00:00|
| 6| San Pedro Square|37.336721000000004| -121.894074| 15|San Jose|2013-08-07 00:00:00|
+---+--------------------+------------------+-------------------+----------+--------+-------------------+
only showing top 5 rows
stationData.describe().show()
+-------+------------------+--------------------+-------------------+-------------------+-----------------+-------------+
|summary| id| name| lat| long| dock_count| city|
+-------+------------------+--------------------+-------------------+-------------------+-----------------+-------------+
| count| 70| 70| 70| 70| 70| 70|
| mean| 43.0| null| 37.59024338428572|-122.21841616428571|17.65714285714286| null|
| stddev|24.166091947189145| null|0.20347253639672502|0.20944604979644524|4.010441857493954| null|
| min| 2| 2nd at Folsom| 37.329732| -122.418954| 11|Mountain View|
| max| 84|Yerba Buena Cente...| 37.80477| -121.877349| 27| San Jose|
+-------+------------------+--------------------+-------------------+-------------------+-----------------+-------------+
Example of using the DataFrame API
Perform a join of the collections by key using the join function: join stationsIndexed with tripsByStartTerminals, and stationsIndexed with tripsByEndTerminals.
tripData.printSchema()
stationData.printSchema()
root
 |-- id: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- start_date: timestamp (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- start_station_id: integer (nullable = true)
 |-- end_date: timestamp (nullable = true)
 |-- end_station_name: string (nullable = true)
 |-- end_station_id: integer (nullable = true)
 |-- bike_id: integer (nullable = true)
 |-- subscription_type: string (nullable = true)
 |-- zip_code: string (nullable = true)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- dock_count: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- installation_date: timestamp (nullable = true)
stationsView = stationData.select(stationData['id'], stationData['name'],
                                  stationData['lat'], stationData['long'])
stationsView.show()
+---+--------------------+------------------+-------------------+
| id| name| lat| long|
+---+--------------------+------------------+-------------------+
| 2|San Jose Diridon ...| 37.329732|-121.90178200000001|
| 3|San Jose Civic Ce...| 37.330698| -121.888979|
| 4|Santa Clara at Al...| 37.333988| -121.894902|
| 5| Adobe on Almaden| 37.331415| -121.8932|
| 6| San Pedro Square|37.336721000000004| -121.894074|
| 7|Paseo de San Antonio| 37.333798|-121.88694299999999|
| 8| San Salvador at 1st| 37.330165|-121.88583100000001|
| 9| Japantown| 37.348742|-121.89471499999999|
| 10| San Jose City Hall| 37.337391| -121.886995|
| 11| MLK Library| 37.335885|-121.88566000000002|
| 12|SJSU 4th at San C...| 37.332808|-121.88389099999999|
| 13| St James Park| 37.339301|-121.88993700000002|
| 14|Arena Green / SAP...| 37.332692| -121.900084|
| 16|SJSU - San Salvad...|37.333954999999996| -121.877349|
| 21| Franklin at Maple| 37.481758| -122.226904|
| 22|Redwood City Calt...|37.486078000000006|-122.23208899999999|
| 23|San Mateo County ...|37.487615999999996| -122.229951|
| 24|Redwood City Publ...| 37.484219| -122.227424|
| 25|Stanford in Redwo...| 37.48537|-122.20328799999999|
| 26|Redwood City Medi...| 37.487682| -122.223492|
+---+--------------------+------------------+-------------------+
only showing top 20 rows
startTrips = tripData.select(tripData.id, tripData.duration, tripData.start_station_id)\
    .withColumnRenamed('id', 'trip_id')\
    .join(stationsView, tripData.start_station_id == stationsView.id)
startTrips = startTrips.drop('id')
startTrips.show()
+-------+--------+----------------+--------------------+------------------+-------------------+
|trip_id|duration|start_station_id| name| lat| long|
+-------+--------+----------------+--------------------+------------------+-------------------+
| 4576| 63| 66|South Van Ness at...| 37.774814| -122.418954|
| 4607| null| 10| San Jose City Hall| 37.337391| -121.886995|
| 4130| 71| 27|Mountain View Cit...| 37.389218| -122.081896|
| 4251| 77| 10| San Jose City Hall| 37.337391| -121.886995|
| 4299| 83| 66|South Van Ness at...| 37.774814| -122.418954|
| 4927| 103| 59| Golden Gate at Polk| 37.781332| -122.418603|
| 4500| 109| 4|Santa Clara at Al...| 37.333988| -121.894902|
| 4563| 111| 8| San Salvador at 1st| 37.330165|-121.88583100000001|
| 4760| 113| 66|South Van Ness at...| 37.774814| -122.418954|
| 4258| 114| 10| San Jose City Hall| 37.337391| -121.886995|
| 4549| 125| 49| Spear at Folsom|37.790302000000004|-122.39063700000001|
| 4498| 126| 6| San Pedro Square|37.336721000000004| -121.894074|
| 4965| 129| 28|Mountain View Cal...|37.394358000000004|-122.07671299999998|
| 4557| 130| 64| 2nd at South Park| 37.782259| -122.392738|
| 4386| 134| 41| Clay at Battery| 37.795001| -122.39997|
| 4749| 138| 47| Post at Kearney| 37.788975| -122.403452|
| 4242| 141| 10| San Jose City Hall| 37.337391| -121.886995|
| 4329| 142| 67| Market at 10th|37.776619000000004|-122.41738500000001|
| 5097| 142| 74| Steuart at Market| 37.794139| -122.394434|
| 5084| 144| 39| Powell Street BART|37.783871000000005| -122.408433|
+-------+--------+----------------+--------------------+------------------+-------------------+
only showing top 20 rows
Example of using the Spark SQL API
stationData.createOrReplaceTempView("stations")
tripData.createOrReplaceTempView("trips")
endTrips = spark.sql("""
SELECT trips.id as trip_id, trips.end_station_id, trips.duration,
       stations.name as station_name, stations.lat, stations.long
FROM trips
INNER JOIN stations ON trips.end_station_id == stations.id
""")
endTrips.show()
+-------+--------------+--------+--------------------+------------------+-------------------+
|trip_id|end_station_id|duration| station_name| lat| long|
+-------+--------------+--------+--------------------+------------------+-------------------+
| 4576| 66| 63|South Van Ness at...| 37.774814| -122.418954|
| 4607| 10| null| San Jose City Hall| 37.337391| -121.886995|
| 4130| 27| 71|Mountain View Cit...| 37.389218| -122.081896|
| 4251| 10| 77| San Jose City Hall| 37.337391| -121.886995|
| 4299| 67| 83| Market at 10th|37.776619000000004|-122.41738500000001|
| 4927| 59| 103| Golden Gate at Polk| 37.781332| -122.418603|
| 4500| 5| 109| Adobe on Almaden| 37.331415| -121.8932|
| 4563| 8| 111| San Salvador at 1st| 37.330165|-121.88583100000001|
| 4760| 66| 113|South Van Ness at...| 37.774814| -122.418954|
| 4258| 11| 114| MLK Library| 37.335885|-121.88566000000002|
| 4549| 54| 125|Embarcadero at Br...| 37.787152|-122.38801299999999|
| 4498| 4| 126|Santa Clara at Al...| 37.333988| -121.894902|
| 4965| 28| 129|Mountain View Cal...|37.394358000000004|-122.07671299999998|
| 4557| 64| 130| 2nd at South Park| 37.782259| -122.392738|
| 4386| 56| 134| Beale at Market| 37.792251|-122.39708600000002|
| 4749| 47| 138| Post at Kearney| 37.788975| -122.403452|
| 4242| 10| 141| San Jose City Hall| 37.337391| -121.886995|
| 4329| 67| 142| Market at 10th|37.776619000000004|-122.41738500000001|
| 5097| 50| 142|Harry Bridges Pla...| 37.795392| -122.394203|
| 5084| 76| 144| Market at 4th| 37.786305|-122.40496599999999|
+-------+--------------+--------+--------------------+------------------+-------------------+
only showing top 20 rows
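The same end-station join can also be expressed with the DataFrame API; a sketch (not part of the original lab), following the startTrips example above. The name endTripsDF is hypothetical.
endTripsDF = tripData.select(tripData.id, tripData.duration, tripData.end_station_id)\
    .withColumnRenamed('id', 'trip_id')\
    .join(stationsView, tripData.end_station_id == stationsView.id)\
    .drop('id')
endTripsDF.show(n=5)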
For each start station, let's compute the average trip duration.
spark.sql(""" SELECT start_station_name, avg(duration) FROM trips GROUP BY trips.start_station_name ORDER BY avg(duration) DESC """).show()
+--------------------+------------------+
| start_station_name| avg(duration)|
+--------------------+------------------+
|University and Em...| 7230.231034482758|
|Redwood City Medi...| 4764.68287037037|
|San Jose Civic Ce...| 4720.621422376409|
| Park at Olive| 4686.397612488521|
|California Ave Ca...| 4502.620639534884|
|Redwood City Publ...|3697.0892307692307|
|Rengstorff Avenue...| 3544.797270955166|
|Palo Alto Caltrai...| 3158.331498866947|
|San Mateo County ...|3002.0827067669175|
|South Van Ness at...|2936.8873503613395|
|San Antonio Shopp...| 2508.434736091298|
|Cowper at University| 2493.220572640509|
| Broadway at Main|2481.2537313432836|
|Redwood City Calt...| 2405.29409190372|
| Japantown|2297.0913838120105|
|San Antonio Caltr...|2103.7238932071646|
|SJSU 4th at San C...| 1995.366021236727|
|Washington at Kea...|1979.3077445652175|
| Mezes Park|1918.1354359925788|
|Arena Green / SAP...|1888.3390476190475|
+--------------------+------------------+
only showing top 20 rows
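The same aggregation expressed with the DataFrame API; a sketch assuming the tripData DataFrame from above (the alias avg_duration is only illustrative).
from pyspark.sql import functions as F

tripData.groupBy('start_station_name')\
    .agg(F.avg('duration').alias('avg_duration'))\
    .orderBy(F.desc('avg_duration'))\
    .show()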
Example of preparing data with Spark SQL, pandas, and h3 for visualization on a folium map
# ! pip install h3 h3_pyspark pandas folium
Let's find the bikes that were ridden on Christmas Day 2014. https://spark.apache.org/docs/latest/api/sql/#make_timestamp
# year - the year to represent, from 1 to 9999
# month - the month-of-year to represent, from 1 (January) to 12 (December)
# day - the day-of-month to represent, from 1 to 31
# hour - the hour-of-day to represent, from 0 to 23
# min - the minute-of-hour to represent, from 0 to 59
# sec - the second-of-minute and its micro-fraction to represent, from 0 to 60.
#       The value can be either an integer like 13, or a fraction like 13.123.
#       If the sec argument equals to 60, the seconds field is set to 0 and
#       1 minute is added to the final timestamp.
# timezone - the time zone identifier. For example, CET, UTC and etc.
spark.sql("""
SELECT bike_id, start_date, end_date
FROM trips
WHERE start_date > make_timestamp(2014, 12, 25, 0, 0, 0)
  AND start_date < make_timestamp(2014, 12, 26, 0, 0, 0)
""").show()
+-------+-------------------+-------------------+
|bike_id| start_date| end_date|
+-------+-------------------+-------------------+
| 379|2014-12-25 22:10:00|2014-12-25 22:18:00|
| 709|2014-12-25 21:21:00|2014-12-25 21:27:00|
| 376|2014-12-25 20:40:00|2014-12-25 20:46:00|
| 541|2014-12-25 20:27:00|2014-12-25 20:32:00|
| 283|2014-12-25 19:56:00|2014-12-25 20:01:00|
| 519|2014-12-25 19:56:00|2014-12-25 20:01:00|
| 583|2014-12-25 19:05:00|2014-12-25 19:07:00|
| 495|2014-12-25 18:42:00|2014-12-25 18:44:00|
| 541|2014-12-25 18:28:00|2014-12-25 18:37:00|
| 585|2014-12-25 18:27:00|2014-12-25 18:37:00|
| 574|2014-12-25 18:12:00|2014-12-25 18:21:00|
| 630|2014-12-25 18:12:00|2014-12-25 18:22:00|
| 583|2014-12-25 18:05:00|2014-12-25 18:22:00|
| 290|2014-12-25 18:01:00|2014-12-25 18:15:00|
| 451|2014-12-25 17:55:00|2014-12-25 18:04:00|
| 630|2014-12-25 17:55:00|2014-12-25 17:59:00|
| 574|2014-12-25 17:54:00|2014-12-25 17:59:00|
| 463|2014-12-25 17:46:00|2014-12-25 17:53:00|
| 628|2014-12-25 17:46:00|2014-12-25 17:53:00|
| 58|2014-12-25 16:32:00|2014-12-25 17:04:00|
+-------+-------------------+-------------------+
only showing top 20 rows
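The same filter can be written with the DataFrame API; a sketch (an assumption, not part of the original lab) that uses to_timestamp literals instead of make_timestamp.
from pyspark.sql import functions as F

christmas_start = F.to_timestamp(F.lit('2014-12-25 00:00:00'))
christmas_end = F.to_timestamp(F.lit('2014-12-26 00:00:00'))
tripData.filter((tripData.start_date > christmas_start) &
                (tripData.start_date < christmas_end))\
    .select('bike_id', 'start_date', 'end_date')\
    .show()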
Let's find the stations that one of the bikes found above passed through.
spark.sql(""" SELECT trips.bike_id, trips.start_date, trips.end_date, stations.name FROM trips INNER JOIN stations ON trips.start_station_id == stations.id WHERE bike_id == 583 AND start_date > make_timestamp(2014, 12, 25, 0, 0, 0) AND start_date < make_timestamp(2014, 12, 26, 0, 0, 0) """).show()
+-------+-------------------+-------------------+--------------+
|bike_id| start_date| end_date| name|
+-------+-------------------+-------------------+--------------+
| 583|2014-12-25 19:05:00|2014-12-25 19:07:00|Market at 10th|
| 583|2014-12-25 18:05:00|2014-12-25 18:22:00|Market at 10th|
| 583|2014-12-25 12:14:00|2014-12-25 12:21:00| Market at 4th|
| 583|2014-12-25 19:05:00|2014-12-25 19:07:00|Market at 10th|
| 583|2014-12-25 18:05:00|2014-12-25 18:22:00|Market at 10th|
| 583|2014-12-25 12:14:00|2014-12-25 12:21:00| Market at 4th|
+-------+-------------------+-------------------+--------------+
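Each trip appears twice in the result above. If such repeats are undesirable, exact duplicate rows can be removed with dropDuplicates(); a sketch (the variable name bike_583_stations is hypothetical).
bike_583_stations = spark.sql("""
SELECT trips.bike_id, trips.start_date, trips.end_date, stations.name
FROM trips
INNER JOIN stations ON trips.start_station_id == stations.id
WHERE bike_id == 583
  AND start_date > make_timestamp(2014, 12, 25, 0, 0, 0)
  AND start_date < make_timestamp(2014, 12, 26, 0, 0, 0)
""")
bike_583_stations.dropDuplicates().show()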
Let's find all stations that fall into the same h3 grid cell as the stations that bike 583 passed through on 25.12.2014.
Let's map the stations' GPS coordinates to h3 coordinates.
from pyspark.sql import functions as F
import h3_pyspark
import h3
H3 Grid Resolutions https://h3geo.org/docs/core-library/restable/
resolution = 8
stationData.withColumn('h3', h3_pyspark.geo_to_h3('lat', 'long', sql.functions.lit(resolution)))\
    .createOrReplaceTempView("stations_h3")
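For a single point the same conversion can be done with the plain h3 library; an illustration (an assumption, using the coordinates of the San Jose Diridon station from stations.csv).
# h3 index of one station at resolution 8
h3.geo_to_h3(37.329732, -121.90178200000001, resolution)
# The new column can then be inspected, e.g.:
spark.sql("SELECT id, name, h3 FROM stations_h3").show(5)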
Using a nested SQL query, let's find the h3 cells of the stations that bike 583 passed through, and then filter the Christmas 2014 trips that started from stations with the same h3 cells.
christmas_583_contacts = spark.sql("""
SELECT trips.bike_id, trips.start_date, stations_h3.h3,
       stations_h3.lat, stations_h3.long, stations_h3.name
FROM trips
INNER JOIN stations_h3 ON trips.start_station_id == stations_h3.id
WHERE stations_h3.h3 IN (
        SELECT stations_h3.h3
        FROM trips
        INNER JOIN stations_h3 ON trips.start_station_id == stations_h3.id
        WHERE bike_id == 583
          AND start_date > make_timestamp(2014, 12, 25, 0, 0, 0)
          AND start_date < make_timestamp(2014, 12, 26, 0, 0, 0))
  AND start_date > make_timestamp(2014, 12, 25, 0, 0, 0)
  AND start_date < make_timestamp(2014, 12, 26, 0, 0, 0)
ORDER BY trips.start_date
""")
christmas_583_contacts.cache()
christmas_583_contacts.show()
+-------+-------------------+---------------+------------------+-------------------+--------------+
|bike_id| start_date| h3| lat| long| name|
+-------+-------------------+---------------+------------------+-------------------+--------------+
| 439|2014-12-25 01:40:00|8828308281fffff|37.776619000000004|-122.41738500000001|Market at 10th|
| 439|2014-12-25 01:40:00|8828308281fffff|37.776619000000004|-122.41738500000001|Market at 10th|
| 659|2014-12-25 09:49:00|88283082abfffff|37.781752000000004|-122.40512700000001| 5th at Howard|
| 465|2014-12-25 09:49:00|88283082abfffff|37.781752000000004|-122.40512700000001| 5th at Howard|
| 659|2014-12-25 09:49:00|88283082abfffff|37.781752000000004|-122.40512700000001| 5th at Howard|
| 465|2014-12-25 09:49:00|88283082abfffff|37.781752000000004|-122.40512700000001| 5th at Howard|
| 583|2014-12-25 12:14:00|88283082abfffff| 37.786305|-122.40496599999999| Market at 4th|
| 583|2014-12-25 12:14:00|88283082abfffff| 37.786305|-122.40496599999999| Market at 4th|
| 479|2014-12-25 12:22:00|88283082abfffff|37.781752000000004|-122.40512700000001| 5th at Howard|
| 331|2014-12-25 12:22:00|88283082abfffff|37.781752000000004|-122.40512700000001| 5th at Howard|
| 479|2014-12-25 12:22:00|88283082abfffff|37.781752000000004|-122.40512700000001| 5th at Howard|
| 331|2014-12-25 12:22:00|88283082abfffff|37.781752000000004|-122.40512700000001| 5th at Howard|
| 330|2014-12-25 12:27:00|88283082abfffff|37.781752000000004|-122.40512700000001| 5th at Howard|
| 438|2014-12-25 12:27:00|88283082abfffff|37.781752000000004|-122.40512700000001| 5th at Howard|
| 330|2014-12-25 12:27:00|88283082abfffff|37.781752000000004|-122.40512700000001| 5th at Howard|
| 438|2014-12-25 12:27:00|88283082abfffff|37.781752000000004|-122.40512700000001| 5th at Howard|
| 428|2014-12-25 12:35:00|88283082abfffff|37.781752000000004|-122.40512700000001| 5th at Howard|
| 428|2014-12-25 12:35:00|88283082abfffff|37.781752000000004|-122.40512700000001| 5th at Howard|
| 214|2014-12-25 12:38:00|88283082abfffff|37.781752000000004|-122.40512700000001| 5th at Howard|
| 292|2014-12-25 12:38:00|88283082abfffff| 37.786305|-122.40496599999999| Market at 4th|
+-------+-------------------+---------------+------------------+-------------------+--------------+
only showing top 20 rows
import pandas as pd
import h3

h3_places = christmas_583_contacts.select('lat', 'long', 'name', 'h3').toPandas()
# source code from https://nbviewer.org/github/uber/h3-py-notebooks/blob/master/notebooks/usage.ipynb
import folium


def init_map(hexagons, width=1100, height=900):
    lats = []
    longs = []
    for hexagon in hexagons:
        lat, long = h3.h3_to_geo(hexagon)
        lats.append(lat)
        longs.append(long)
    return folium.Map(location=[sum(lats)/len(lats), sum(longs)/len(longs)],
                      zoom_start=15, tiles='cartodbpositron',
                      width=width, height=height)


def visualize_hexagons(folium_map, hexagons, color="red"):
    """
    hexagons is a list of hexcluster. Each hexcluster is a list of hexagons.
    eg. [[hex1, hex2], [hex3, hex4]]
    """
    polylines = []
    lat = []
    lng = []
    for hex in hexagons:
        polygons = h3.h3_set_to_multi_polygon([hex], geo_json=False)
        # flatten polygons into loops.
        outlines = [loop for polygon in polygons for loop in polygon]
        polyline = [outline + [outline[0]] for outline in outlines][0]
        lat.extend(map(lambda v: v[0], polyline))
        lng.extend(map(lambda v: v[1], polyline))
        polylines.append(polyline)
    for polyline in polylines:
        my_PolyLine = folium.PolyLine(locations=polyline, weight=8, color=color)
        folium_map.add_child(my_PolyLine)
    return folium_map


def visualize_stations(folium_map, stations, color="red"):
    """
    stations is a dataframe with columns: lat, long, station_name
    """
    for idx, lat, long, station_name in stations.itertuples():
        folium_map.add_child(folium.map.Marker(location=(lat, long)))
        folium_map.add_child(folium.map.Marker(
            location=(lat, long),
            icon=folium.features.DivIcon(
                icon_size=(500, 36),
                icon_anchor=(-17, 37),
                html=f'<div style="display: inline-block;font-size: 10pt; background: rgba(255, 255, 255, 0.8)">{station_name}</div>',
            )))
    return folium_map
m = init_map(h3_places.h3.unique())
visualize_hexagons(m, h3_places.h3.unique(), color="black")
visualize_stations(m, h3_places.loc[:, ['lat', 'long', 'name']])
display(m)
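Outside a notebook the map can also be written to a standalone HTML page; a small sketch (the file name is arbitrary).
# Save the folium map as a self-contained HTML file
m.save('christmas_583_contacts.html')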
sc.stop()