Skip to content

Commit

Permalink
Merge pull request #1538 from microsoft/laserprec/opt-unit-tests-exp
Browse files Browse the repository at this point in the history
Optimize Notebook Unit Tests
  • Loading branch information
Jianjie Liu authored Oct 11, 2021
2 parents 846d214 + e8fd200 commit 68066dd
Show file tree
Hide file tree
Showing 20 changed files with 1,063 additions and 1,510 deletions.
296 changes: 151 additions & 145 deletions examples/00_quick_start/als_movielens.ipynb

Large diffs are not rendered by default.

531 changes: 273 additions & 258 deletions examples/02_model_collaborative_filtering/als_deep_dive.ipynb

Large diffs are not rendered by default.

102 changes: 53 additions & 49 deletions examples/03_evaluate/als_movielens_diversity_metrics.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,8 @@
"output_type": "stream",
"name": "stdout",
"text": [
"System version: 3.6.13 |Anaconda, Inc.| (default, Jun 4 2021, 14:25:59) \n",
"[GCC 7.5.0]\n",
"System version: 3.6.9 (default, Jan 26 2021, 15:33:00) \n",
"[GCC 8.4.0]\n",
"Spark version: 2.4.8\n"
]
}
Expand Down Expand Up @@ -206,7 +206,9 @@
"# user, item column names\n",
"COL_USER=\"UserId\"\n",
"COL_ITEM=\"MovieId\"\n",
"COL_RATING=\"Rating\""
"COL_RATING=\"Rating\"\n",
"COL_TITLE=\"Title\"\n",
"COL_GENRE=\"Genre\""
],
"outputs": [],
"metadata": {
Expand Down Expand Up @@ -259,23 +261,23 @@
" )\n",
")\n",
"\n",
"data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema, title_col=\"title\", genres_col=\"genres\")\n",
"data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema, title_col=COL_TITLE, genres_col=COL_GENRE)\n",
"data.show()"
],
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"100%|██████████| 4.81k/4.81k [00:00<00:00, 17.1kKB/s]\n"
"100%|██████████| 4.81k/4.81k [00:00<00:00, 20.1kKB/s]\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"+-------+------+------+---------+--------------------+------+\n",
"|MovieId|UserId|Rating|Timestamp| title|genres|\n",
"|MovieId|UserId|Rating|Timestamp| Title| Genre|\n",
"+-------+------+------+---------+--------------------+------+\n",
"| 26| 138| 5.0|879024232|Brothers McMullen...|Comedy|\n",
"| 26| 224| 3.0|888104153|Brothers McMullen...|Comedy|\n",
Expand Down Expand Up @@ -406,7 +408,7 @@
"output_type": "stream",
"name": "stdout",
"text": [
"Took 4.012367556002573 seconds for training.\n"
"Took 4.189040212018881 seconds for training.\n"
]
}
],
Expand Down Expand Up @@ -563,9 +565,9 @@
" test_df, \n",
" top_all, \n",
" k = TOP_K, \n",
" col_user=\"UserId\", \n",
" col_item=\"MovieId\",\n",
" col_rating=\"Rating\", \n",
" col_user=COL_USER, \n",
" col_item=COL_ITEM,\n",
" col_rating=COL_RATING, \n",
" col_prediction=\"prediction\",\n",
" relevancy_method=\"top_k\"\n",
")\n",
Expand Down Expand Up @@ -735,15 +737,15 @@
" <td>100k</td>\n",
" <td>random</td>\n",
" <td>10</td>\n",
" <td>0.016755</td>\n",
" <td>0.005883</td>\n",
" <td>0.017849</td>\n",
" <td>0.001890</td>\n",
" <td>0.996326</td>\n",
" <td>10.540834</td>\n",
" <td>12.133664</td>\n",
" <td>0.922288</td>\n",
" <td>0.893001</td>\n",
" <td>0.016543</td>\n",
" <td>0.005566</td>\n",
" <td>0.016373</td>\n",
" <td>0.001441</td>\n",
" <td>0.994489</td>\n",
" <td>10.541850</td>\n",
" <td>12.136439</td>\n",
" <td>0.922613</td>\n",
" <td>0.892511</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
Expand All @@ -752,15 +754,15 @@
"text/plain": [
" Data Algo K Precision@k Recall@k NDCG@k Mean average precision \\\n",
"1 100k als 10 0.047296 0.016015 0.043097 0.004579 \n",
"2 100k random 10 0.016755 0.005883 0.017849 0.001890 \n",
"2 100k random 10 0.016543 0.005566 0.016373 0.001441 \n",
"\n",
" catalog_coverage distributional_coverage novelty diversity \\\n",
"1 0.385793 7.967257 11.659776 0.892277 \n",
"2 0.996326 10.540834 12.133664 0.922288 \n",
"2 0.994489 10.541850 12.136439 0.922613 \n",
"\n",
" serendipity \n",
"1 0.878733 \n",
"2 0.893001 "
"2 0.892511 "
]
},
"metadata": {},
Expand Down Expand Up @@ -791,10 +793,10 @@
"source": [
"# Get movie features \"title\" and \"genres\"\n",
"movies = (\n",
" data.groupBy(\"MovieId\", \"title\", \"genres\").count()\n",
" data.groupBy(COL_ITEM, COL_TITLE, COL_GENRE).count()\n",
" .na.drop() # remove rows with null values\n",
" .withColumn(\"genres\", F.split(F.col(\"genres\"), \"\\|\")) # convert to array of genres\n",
" .withColumn(\"title\", F.regexp_replace(F.col(\"title\"), \"[\\(),:^0-9]\", \"\")) # remove year from title\n",
" .withColumn(COL_GENRE, F.split(F.col(COL_GENRE), \"\\|\")) # convert to array of genres\n",
" .withColumn(COL_TITLE, F.regexp_replace(F.col(COL_TITLE), \"[\\(),:^0-9]\", \"\")) # remove year from title\n",
" .drop(\"count\") # remove unused columns\n",
")"
],
Expand All @@ -806,12 +808,12 @@
"execution_count": 25,
"source": [
"# tokenize \"title\" column\n",
"title_tokenizer = Tokenizer(inputCol=\"title\", outputCol=\"title_words\")\n",
"title_tokenizer = Tokenizer(inputCol=COL_TITLE, outputCol=\"title_words\")\n",
"tokenized_data = title_tokenizer.transform(movies)\n",
"\n",
"# remove stop words\n",
"remover = StopWordsRemover(inputCol=\"title_words\", outputCol=\"text\")\n",
"clean_data = remover.transform(tokenized_data).drop(\"title\", \"title_words\")"
"clean_data = remover.transform(tokenized_data).drop(COL_TITLE, \"title_words\")"
],
"outputs": [],
"metadata": {}
Expand All @@ -827,7 +829,7 @@
"hashed_data = text_hasher.transform(clean_data)\n",
"\n",
"# step 2: fit a CountVectorizerModel from column \"genres\".\n",
"count_vectorizer = CountVectorizer(inputCol=\"genres\", outputCol=\"genres_features\")\n",
"count_vectorizer = CountVectorizer(inputCol=COL_GENRE, outputCol=\"genres_features\")\n",
"count_vectorizer_model = count_vectorizer.fit(hashed_data)\n",
"vectorized_data = count_vectorizer_model.transform(hashed_data)\n",
"\n",
Expand All @@ -836,7 +838,7 @@
" inputCols=[\"text_features\", \"genres_features\"],\n",
" outputCol=\"features\",\n",
")\n",
"feature_data = assembler.transform(vectorized_data).select(\"MovieId\", \"features\")\n",
"feature_data = assembler.transform(vectorized_data).select(COL_ITEM, \"features\")\n",
"\n",
"feature_data.show(10, False)"
],
Expand All @@ -845,20 +847,20 @@
"output_type": "stream",
"name": "stdout",
"text": [
"+-------+---------------------------------------------+\n",
"|MovieId|features |\n",
"+-------+---------------------------------------------+\n",
"|167 |(1043,[128,544,1025],[1.0,1.0,1.0]) |\n",
"|1343 |(1043,[38,300,1024],[1.0,1.0,1.0]) |\n",
"|1607 |(1043,[592,821,1024],[1.0,1.0,1.0]) |\n",
"|966 |(1043,[389,502,1028],[1.0,1.0,1.0]) |\n",
"|9 |(1043,[11,342,1014,1024],[1.0,1.0,1.0,1.0]) |\n",
"|1230 |(1043,[597,740,902,1025],[1.0,1.0,1.0,1.0]) |\n",
"|1118 |(1043,[702,1025],[1.0,1.0]) |\n",
"|673 |(1043,[169,690,1027,1040],[1.0,1.0,1.0,1.0]) |\n",
"|879 |(1043,[909,1026,1027,1034],[1.0,1.0,1.0,1.0])|\n",
"|66 |(1043,[256,1025,1028],[1.0,1.0,1.0]) |\n",
"+-------+---------------------------------------------+\n",
"+------+---------------------------------------------+\n",
"|ItemId|features |\n",
"+------+---------------------------------------------+\n",
"|167 |(1043,[128,544,1025],[1.0,1.0,1.0]) |\n",
"|1343 |(1043,[38,300,1024],[1.0,1.0,1.0]) |\n",
"|1607 |(1043,[592,821,1024],[1.0,1.0,1.0]) |\n",
"|966 |(1043,[389,502,1028],[1.0,1.0,1.0]) |\n",
"|9 |(1043,[11,342,1014,1024],[1.0,1.0,1.0,1.0]) |\n",
"|1230 |(1043,[597,740,902,1025],[1.0,1.0,1.0,1.0]) |\n",
"|1118 |(1043,[702,1025],[1.0,1.0]) |\n",
"|673 |(1043,[169,690,1027,1040],[1.0,1.0,1.0,1.0]) |\n",
"|879 |(1043,[909,1026,1027,1034],[1.0,1.0,1.0,1.0])|\n",
"|66 |(1043,[256,1025,1028],[1.0,1.0,1.0]) |\n",
"+------+---------------------------------------------+\n",
"only showing top 10 rows\n",
"\n"
]
Expand Down Expand Up @@ -926,8 +928,8 @@
"output_type": "stream",
"name": "stdout",
"text": [
"0.8978120851519519\n",
"0.8937850286817351\n"
"0.8982144953920664\n",
"0.8941807579293202\n"
]
}
],
Expand Down Expand Up @@ -965,9 +967,8 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python (reco_pyspark)",
"language": "python",
"name": "reco_pyspark"
"name": "python3",
"display_name": "Python 3.6.9 64-bit ('.env': venv)"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -979,7 +980,10 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.13"
"version": "3.6.9"
},
"interpreter": {
"hash": "7ec2189bea0434770dca7423a25e631e1cca9c4e2b4ff137a82f4dff32ac9607"
}
},
"nbformat": 4,
Expand Down
Loading

0 comments on commit 68066dd

Please sign in to comment.