Added DataFrame/SQL examples. Introduced two errors in the first two rows of the trips dataset to practice handling missing (not-available) values.

master
Vladimir Protsenko 3 years ago
parent 7af60e60c9
commit d947b561a2

@ -42,7 +42,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 124, "execution_count": 128,
"id": "30002669-3799-4a39-831e-d276a4708f9a", "id": "30002669-3799-4a39-831e-d276a4708f9a",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -52,7 +52,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 125, "execution_count": 130,
"id": "3ed7b961-7879-4937-ac24-11d615e091b8", "id": "3ed7b961-7879-4937-ac24-11d615e091b8",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -62,7 +62,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 126, "execution_count": 131,
"id": "0da718e8-7ad8-42f1-872f-c8805fd3c41c", "id": "0da718e8-7ad8-42f1-872f-c8805fd3c41c",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -70,6 +70,18 @@
"sc = SparkContext(conf=conf)" "sc = SparkContext(conf=conf)"
] ]
}, },
{
"cell_type": "code",
"execution_count": 133,
"id": "47896fd0-906e-413d-8980-3948d73a3067",
"metadata": {},
"outputs": [],
"source": [
"tripData = sc.textFile(\"tripd_with_error.csv\")\n",
"tripsHeader = tripData.first()\n",
"trips = tripData.filter(lambda row: row != tripsHeader).map(lambda row: row.split(\",\", -1))"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 10,
@ -492,7 +504,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 62, "execution_count": 134,
"id": "84aa084c-8e7a-427b-9bd9-1b7e106c1ea2", "id": "84aa084c-8e7a-427b-9bd9-1b7e106c1ea2",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -554,7 +566,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 80, "execution_count": 150,
"id": "9f728f0d-86ee-4d9e-9292-31ca3c1fae35", "id": "9f728f0d-86ee-4d9e-9292-31ca3c1fae35",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -574,7 +586,8 @@
" zip_code: str\n", " zip_code: str\n",
" \n", " \n",
" for trip in trips:\n", " for trip in trips:\n",
" yield Trip( \n", " try:\n",
" yield Trip( \n",
" trip_id = int(trip[0]),\n", " trip_id = int(trip[0]),\n",
" duration = int(trip[1]),\n", " duration = int(trip[1]),\n",
" start_date = datetime.strptime(trip[2], '%m/%d/%Y %H:%M'),\n", " start_date = datetime.strptime(trip[2], '%m/%d/%Y %H:%M'),\n",
@ -586,29 +599,40 @@
" bike_id = int(trip[8]),\n", " bike_id = int(trip[8]),\n",
" subscription_type = trip[9],\n", " subscription_type = trip[9],\n",
" zip_code = trip[10]\n", " zip_code = trip[10]\n",
" )" " ) \n",
" except:\n",
" pass"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 81, "execution_count": 151,
"id": "c265426b-f20b-426e-aeba-7295bc797f3b", "id": "c265426b-f20b-426e-aeba-7295bc797f3b",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"Trip(trip_id=4576, duration=63, start_date=datetime.datetime(2013, 8, 29, 14, 13), start_station_name='South Van Ness at Market', start_station_id=66, end_date=datetime.datetime(2013, 8, 29, 14, 14), end_station_name='South Van Ness at Market', end_station_id='66', bike_id=520, subscription_type='Subscriber', zip_code='94127')" "[Trip(trip_id=4607, duration=70, start_date=datetime.datetime(2013, 8, 29, 14, 42), start_station_name='San Jose City Hall', start_station_id=10, end_date=datetime.datetime(2013, 8, 29, 14, 43), end_station_name='San Jose City Hall', end_station_id='10', bike_id=661, subscription_type='Subscriber', zip_code='95138'),\n",
" Trip(trip_id=4130, duration=71, start_date=datetime.datetime(2013, 8, 29, 10, 16), start_station_name='Mountain View City Hall', start_station_id=27, end_date=datetime.datetime(2013, 8, 29, 10, 17), end_station_name='Mountain View City Hall', end_station_id='27', bike_id=48, subscription_type='Subscriber', zip_code='97214'),\n",
" Trip(trip_id=4251, duration=77, start_date=datetime.datetime(2013, 8, 29, 11, 29), start_station_name='San Jose City Hall', start_station_id=10, end_date=datetime.datetime(2013, 8, 29, 11, 30), end_station_name='San Jose City Hall', end_station_id='10', bike_id=26, subscription_type='Subscriber', zip_code='95060'),\n",
" Trip(trip_id=4299, duration=83, start_date=datetime.datetime(2013, 8, 29, 12, 2), start_station_name='South Van Ness at Market', start_station_id=66, end_date=datetime.datetime(2013, 8, 29, 12, 4), end_station_name='Market at 10th', end_station_id='67', bike_id=319, subscription_type='Subscriber', zip_code='94103'),\n",
" Trip(trip_id=4927, duration=103, start_date=datetime.datetime(2013, 8, 29, 18, 54), start_station_name='Golden Gate at Polk', start_station_id=59, end_date=datetime.datetime(2013, 8, 29, 18, 56), end_station_name='Golden Gate at Polk', end_station_id='59', bike_id=527, subscription_type='Subscriber', zip_code='94109'),\n",
" Trip(trip_id=4500, duration=109, start_date=datetime.datetime(2013, 8, 29, 13, 25), start_station_name='Santa Clara at Almaden', start_station_id=4, end_date=datetime.datetime(2013, 8, 29, 13, 27), end_station_name='Adobe on Almaden', end_station_id='5', bike_id=679, subscription_type='Subscriber', zip_code='95112'),\n",
" Trip(trip_id=4563, duration=111, start_date=datetime.datetime(2013, 8, 29, 14, 2), start_station_name='San Salvador at 1st', start_station_id=8, end_date=datetime.datetime(2013, 8, 29, 14, 4), end_station_name='San Salvador at 1st', end_station_id='8', bike_id=687, subscription_type='Subscriber', zip_code='95112'),\n",
" Trip(trip_id=4760, duration=113, start_date=datetime.datetime(2013, 8, 29, 17, 1), start_station_name='South Van Ness at Market', start_station_id=66, end_date=datetime.datetime(2013, 8, 29, 17, 3), end_station_name='South Van Ness at Market', end_station_id='66', bike_id=553, subscription_type='Subscriber', zip_code='94103'),\n",
" Trip(trip_id=4258, duration=114, start_date=datetime.datetime(2013, 8, 29, 11, 33), start_station_name='San Jose City Hall', start_station_id=10, end_date=datetime.datetime(2013, 8, 29, 11, 35), end_station_name='MLK Library', end_station_id='11', bike_id=107, subscription_type='Subscriber', zip_code='95060'),\n",
" Trip(trip_id=4549, duration=125, start_date=datetime.datetime(2013, 8, 29, 13, 52), start_station_name='Spear at Folsom', start_station_id=49, end_date=datetime.datetime(2013, 8, 29, 13, 55), end_station_name='Embarcadero at Bryant', end_station_id='54', bike_id=368, subscription_type='Subscriber', zip_code='94109')]"
] ]
}, },
"execution_count": 81, "execution_count": 151,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"tripsInternal = trips.mapPartitions(initTrip)\n", "tripsInternal = trips.mapPartitions(initTrip)\n",
"tripsInternal.first()" "tripsInternal.take(10)"
] ]
}, },
{ {
@ -969,7 +993,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 127, "execution_count": 152,
"id": "c98261f7-283c-4c7e-b915-3778ff972f5f", "id": "c98261f7-283c-4c7e-b915-3778ff972f5f",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],

@ -1,6 +1,6 @@
id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code
4576,63,8/29/2013 14:13,South Van Ness at Market,66,8/29/2013 14:14,South Van Ness at Market,66,520,Subscriber,94127 4576,63,,South Van Ness at Market,66,8/29/2013 14:14,South Van Ness at Market,66,520,Subscriber,94127
4607,70,8/29/2013 14:42,San Jose City Hall,10,8/29/2013 14:43,San Jose City Hall,10,661,Subscriber,95138 4607,,8/29/2013 14:42,San Jose City Hall,10,8/29/2013 14:43,San Jose City Hall,10,661,Subscriber,95138
4130,71,8/29/2013 10:16,Mountain View City Hall,27,8/29/2013 10:17,Mountain View City Hall,27,48,Subscriber,97214 4130,71,8/29/2013 10:16,Mountain View City Hall,27,8/29/2013 10:17,Mountain View City Hall,27,48,Subscriber,97214
4251,77,8/29/2013 11:29,San Jose City Hall,10,8/29/2013 11:30,San Jose City Hall,10,26,Subscriber,95060 4251,77,8/29/2013 11:29,San Jose City Hall,10,8/29/2013 11:30,San Jose City Hall,10,26,Subscriber,95060
4299,83,8/29/2013 12:02,South Van Ness at Market,66,8/29/2013 12:04,Market at 10th,67,319,Subscriber,94103 4299,83,8/29/2013 12:02,South Van Ness at Market,66,8/29/2013 12:04,Market at 10th,67,319,Subscriber,94103

Can't render this file because it is too large.
Loading…
Cancel
Save