|
|
|
@ -42,7 +42,7 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 124,
|
|
|
|
|
"execution_count": 128,
|
|
|
|
|
"id": "30002669-3799-4a39-831e-d276a4708f9a",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
@ -52,7 +52,7 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 125,
|
|
|
|
|
"execution_count": 130,
|
|
|
|
|
"id": "3ed7b961-7879-4937-ac24-11d615e091b8",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
@ -62,7 +62,7 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 126,
|
|
|
|
|
"execution_count": 131,
|
|
|
|
|
"id": "0da718e8-7ad8-42f1-872f-c8805fd3c41c",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
@ -70,6 +70,18 @@
|
|
|
|
|
"sc = SparkContext(conf=conf)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 133,
|
|
|
|
|
"id": "47896fd0-906e-413d-8980-3948d73a3067",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"tripData = sc.textFile(\"tripd_with_error.csv\")\n",
|
|
|
|
|
"tripsHeader = tripData.first()\n",
|
|
|
|
|
"trips = tripData.filter(lambda row: row != tripsHeader).map(lambda row: row.split(\",\", -1))"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 10,
|
|
|
|
@ -492,7 +504,7 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 62,
|
|
|
|
|
"execution_count": 134,
|
|
|
|
|
"id": "84aa084c-8e7a-427b-9bd9-1b7e106c1ea2",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
@ -554,7 +566,7 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 80,
|
|
|
|
|
"execution_count": 150,
|
|
|
|
|
"id": "9f728f0d-86ee-4d9e-9292-31ca3c1fae35",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
@ -574,7 +586,8 @@
|
|
|
|
|
" zip_code: str\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" for trip in trips:\n",
|
|
|
|
|
" yield Trip( \n",
|
|
|
|
|
" try:\n",
|
|
|
|
|
" yield Trip( \n",
|
|
|
|
|
" trip_id = int(trip[0]),\n",
|
|
|
|
|
" duration = int(trip[1]),\n",
|
|
|
|
|
" start_date = datetime.strptime(trip[2], '%m/%d/%Y %H:%M'),\n",
|
|
|
|
@ -586,29 +599,40 @@
|
|
|
|
|
" bike_id = int(trip[8]),\n",
|
|
|
|
|
" subscription_type = trip[9],\n",
|
|
|
|
|
" zip_code = trip[10]\n",
|
|
|
|
|
" )"
|
|
|
|
|
" ) \n",
|
|
|
|
|
" except:\n",
|
|
|
|
|
" pass"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 81,
|
|
|
|
|
"execution_count": 151,
|
|
|
|
|
"id": "c265426b-f20b-426e-aeba-7295bc797f3b",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"Trip(trip_id=4576, duration=63, start_date=datetime.datetime(2013, 8, 29, 14, 13), start_station_name='South Van Ness at Market', start_station_id=66, end_date=datetime.datetime(2013, 8, 29, 14, 14), end_station_name='South Van Ness at Market', end_station_id='66', bike_id=520, subscription_type='Subscriber', zip_code='94127')"
|
|
|
|
|
"[Trip(trip_id=4607, duration=70, start_date=datetime.datetime(2013, 8, 29, 14, 42), start_station_name='San Jose City Hall', start_station_id=10, end_date=datetime.datetime(2013, 8, 29, 14, 43), end_station_name='San Jose City Hall', end_station_id='10', bike_id=661, subscription_type='Subscriber', zip_code='95138'),\n",
|
|
|
|
|
" Trip(trip_id=4130, duration=71, start_date=datetime.datetime(2013, 8, 29, 10, 16), start_station_name='Mountain View City Hall', start_station_id=27, end_date=datetime.datetime(2013, 8, 29, 10, 17), end_station_name='Mountain View City Hall', end_station_id='27', bike_id=48, subscription_type='Subscriber', zip_code='97214'),\n",
|
|
|
|
|
" Trip(trip_id=4251, duration=77, start_date=datetime.datetime(2013, 8, 29, 11, 29), start_station_name='San Jose City Hall', start_station_id=10, end_date=datetime.datetime(2013, 8, 29, 11, 30), end_station_name='San Jose City Hall', end_station_id='10', bike_id=26, subscription_type='Subscriber', zip_code='95060'),\n",
|
|
|
|
|
" Trip(trip_id=4299, duration=83, start_date=datetime.datetime(2013, 8, 29, 12, 2), start_station_name='South Van Ness at Market', start_station_id=66, end_date=datetime.datetime(2013, 8, 29, 12, 4), end_station_name='Market at 10th', end_station_id='67', bike_id=319, subscription_type='Subscriber', zip_code='94103'),\n",
|
|
|
|
|
" Trip(trip_id=4927, duration=103, start_date=datetime.datetime(2013, 8, 29, 18, 54), start_station_name='Golden Gate at Polk', start_station_id=59, end_date=datetime.datetime(2013, 8, 29, 18, 56), end_station_name='Golden Gate at Polk', end_station_id='59', bike_id=527, subscription_type='Subscriber', zip_code='94109'),\n",
|
|
|
|
|
" Trip(trip_id=4500, duration=109, start_date=datetime.datetime(2013, 8, 29, 13, 25), start_station_name='Santa Clara at Almaden', start_station_id=4, end_date=datetime.datetime(2013, 8, 29, 13, 27), end_station_name='Adobe on Almaden', end_station_id='5', bike_id=679, subscription_type='Subscriber', zip_code='95112'),\n",
|
|
|
|
|
" Trip(trip_id=4563, duration=111, start_date=datetime.datetime(2013, 8, 29, 14, 2), start_station_name='San Salvador at 1st', start_station_id=8, end_date=datetime.datetime(2013, 8, 29, 14, 4), end_station_name='San Salvador at 1st', end_station_id='8', bike_id=687, subscription_type='Subscriber', zip_code='95112'),\n",
|
|
|
|
|
" Trip(trip_id=4760, duration=113, start_date=datetime.datetime(2013, 8, 29, 17, 1), start_station_name='South Van Ness at Market', start_station_id=66, end_date=datetime.datetime(2013, 8, 29, 17, 3), end_station_name='South Van Ness at Market', end_station_id='66', bike_id=553, subscription_type='Subscriber', zip_code='94103'),\n",
|
|
|
|
|
" Trip(trip_id=4258, duration=114, start_date=datetime.datetime(2013, 8, 29, 11, 33), start_station_name='San Jose City Hall', start_station_id=10, end_date=datetime.datetime(2013, 8, 29, 11, 35), end_station_name='MLK Library', end_station_id='11', bike_id=107, subscription_type='Subscriber', zip_code='95060'),\n",
|
|
|
|
|
" Trip(trip_id=4549, duration=125, start_date=datetime.datetime(2013, 8, 29, 13, 52), start_station_name='Spear at Folsom', start_station_id=49, end_date=datetime.datetime(2013, 8, 29, 13, 55), end_station_name='Embarcadero at Bryant', end_station_id='54', bike_id=368, subscription_type='Subscriber', zip_code='94109')]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 81,
|
|
|
|
|
"execution_count": 151,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"tripsInternal = trips.mapPartitions(initTrip)\n",
|
|
|
|
|
"tripsInternal.first()"
|
|
|
|
|
"tripsInternal.take(10)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
@ -969,7 +993,7 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 127,
|
|
|
|
|
"execution_count": 152,
|
|
|
|
|
"id": "c98261f7-283c-4c7e-b915-3778ff972f5f",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|