diff --git a/data-analysis/data_analysis_findings.ipynb b/data-analysis/data_analysis_findings.ipynb index 88d3d3a8aa..731206aeec 100644 --- a/data-analysis/data_analysis_findings.ipynb +++ b/data-analysis/data_analysis_findings.ipynb @@ -166,7 +166,8 @@ "import pandas as pd\n", "\n", "james_bond_data_html = pd.read_html(\n", - " \"https://en.wikipedia.org/wiki/List_of_James_Bond_novels_and_short_stories\"\n", + " \"https://en.wikipedia.org/wiki/List_of_James_Bond_novels_and_short_stories\",\n", + " storage_options={\"User-Agent\": \"Mozilla/5.0\"},\n", ")\n", "james_bond_tables = james_bond_data_html[1].convert_dtypes()" ] @@ -305,7 +306,7 @@ " .assign(\n", " income_usa=lambda data: (\n", " data[\"income_usa\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " )\n", @@ -327,17 +328,17 @@ " .assign(\n", " income_usa=lambda data: (\n", " data[\"income_usa\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " income_world=lambda data: (\n", " data[\"income_world\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " movie_budget=lambda data: (\n", " data[\"movie_budget\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " )\n", @@ -367,21 +368,21 @@ " .assign(\n", " income_usa=lambda data: (\n", " data[\"income_usa\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " income_world=lambda data: (\n", " data[\"income_world\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " movie_budget=lambda data: (\n", " data[\"movie_budget\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " film_length=lambda data: (\n", - " data[\"film_length\"].str.removesuffix(\"mins\").astype(\"Int64\")\n", + " data[\"film_length\"].str.removesuffix(\" mins\").astype(\"Int64\")\n", " ),\n", " )\n", ")" @@ -442,21 +443,21 @@ " .assign(\n", " income_usa=lambda data: (\n", " data[\"income_usa\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " income_world=lambda data: (\n", " data[\"income_world\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " movie_budget=lambda data: (\n", " data[\"movie_budget\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " film_length=lambda data: (\n", - " data[\"film_length\"].str.removesuffix(\"mins\").astype(\"Int64\")\n", + " data[\"film_length\"].str.removesuffix(\" mins\").astype(\"Int64\")\n", " ),\n", " release_date=lambda data: pd.to_datetime(\n", " data[\"release_date\"], format=\"%B, %Y\"\n", @@ -529,22 +530,22 @@ " .assign(\n", " income_usa=lambda data: (\n", " data[\"income_usa\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " income_world=lambda data: (\n", " data[\"income_world\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " movie_budget=lambda data: (\n", " data[\"movie_budget\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " * 1000\n", " ),\n", " film_length=lambda data: (\n", - " data[\"film_length\"].str.removesuffix(\"mins\").astype(\"Int64\")\n", + " data[\"film_length\"].str.removesuffix(\" mins\").astype(\"Int64\")\n", " ),\n", " release_date=lambda data: pd.to_datetime(\n", " data[\"release_date\"], format=\"%B, %Y\"\n", @@ -597,22 +598,22 @@ " .assign(\n", " income_usa=lambda data: (\n", " data[\"income_usa\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " income_world=lambda data: (\n", " data[\"income_world\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " movie_budget=lambda data: (\n", " data[\"movie_budget\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " * 1000\n", " ),\n", " film_length=lambda data: (\n", - " data[\"film_length\"].str.removesuffix(\"mins\").astype(\"Int64\")\n", + " data[\"film_length\"].str.removesuffix(\" mins\").astype(\"Int64\")\n", " ),\n", " release_date=lambda data: pd.to_datetime(\n", " data[\"release_date\"], format=\"%B, %Y\"\n", @@ -662,22 +663,22 @@ " .assign(\n", " income_usa=lambda data: (\n", " data[\"income_usa\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " income_world=lambda data: (\n", " data[\"income_world\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " movie_budget=lambda data: (\n", " data[\"movie_budget\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " * 1000\n", " ),\n", " film_length=lambda data: (\n", - " data[\"film_length\"].str.removesuffix(\"mins\").astype(\"Int64\")\n", + " data[\"film_length\"].str.removesuffix(\" mins\").astype(\"Int64\")\n", " ),\n", " release_date=lambda data: pd.to_datetime(\n", " data[\"release_date\"], format=\"%B, %Y\"\n", @@ -738,23 +739,23 @@ " .assign(\n", " income_usa=lambda data: (\n", " data[\"income_usa\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " income_world=lambda data: (\n", " data[\"income_world\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " movie_budget=lambda data: (\n", " data[\"movie_budget\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " * 1000\n", " ),\n", " film_length=lambda data: (\n", " data[\"film_length\"]\n", - " .str.removesuffix(\"mins\")\n", + " .str.removesuffix(\" mins\")\n", " .astype(\"Int64\")\n", " .replace(1200, 120)\n", " ),\n", @@ -820,23 +821,23 @@ " .assign(\n", " income_usa=lambda data: (\n", " data[\"income_usa\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " income_world=lambda data: (\n", " data[\"income_world\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " movie_budget=lambda data: (\n", " data[\"movie_budget\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " * 1000\n", " ),\n", " film_length=lambda data: (\n", " data[\"film_length\"]\n", - " .str.removesuffix(\"mins\")\n", + " .str.removesuffix(\" mins\")\n", " .astype(\"Int64\")\n", " .replace(1200, 120)\n", " ),\n", diff --git a/data-analysis/data_analysis_results.ipynb b/data-analysis/data_analysis_results.ipynb index 396d715a0b..9b4c9a1adc 100644 --- a/data-analysis/data_analysis_results.ipynb +++ b/data-analysis/data_analysis_results.ipynb @@ -54,23 +54,23 @@ " .assign(\n", " income_usa=lambda data: (\n", " data[\"income_usa\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " income_world=lambda data: (\n", " data[\"income_world\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " ),\n", " movie_budget=lambda data: (\n", " data[\"movie_budget\"]\n", - " .replace(\"[$,]\", \"\", regex=True)\n", + " .replace(r\"[$,\\s]\", \"\", regex=True)\n", " .astype(\"Float64\")\n", " * 1000\n", " ),\n", " film_length=lambda data: (\n", " data[\"film_length\"]\n", - " .str.removesuffix(\"mins\")\n", + " .str.removesuffix(\" mins\")\n", " .astype(\"Int64\")\n", " .replace(1200, 120)\n", " ),\n", diff --git a/data-analysis/james_bond_data_cleansed.csv b/data-analysis/james_bond_data_cleansed.csv index b01499462a..0afda2076b 100644 --- a/data-analysis/james_bond_data_cleansed.csv +++ b/data-analysis/james_bond_data_cleansed.csv @@ -1,26 +1,26 @@ -bond_actor,bond_kills,car_manufacturer,film_length,imdb,income_usa,income_world,martinis_consumed,movie_budget,movie_title,release_date,rotten_tomatoes,release_year -Sean Connery,4,Sunbeam,110,7.3,16067035.0,59567035.0,2,1000000.0,Dr. No,1962-06-01,7.7,1962 -Sean Connery,11,Bentley,115,7.5,24800000.0,78900000.0,0,2000000.0,From Russia with Love,1963-08-01,8.0,1963 -Sean Connery,9,Aston Martin,110,7.8,51100000.0,124900000.0,1,3000000.0,Goldfinger,1964-05-01,8.4,1964 -Sean Connery,20,Aston Martin,130,7.0,63600000.0,141200000.0,0,9000000.0,Thunderball,1965-09-01,6.8,1965 -Sean Connery,21,Toyota,117,6.9,43100000.0,111600000.0,1,9500000.0,You Only Live Twice,1967-11-01,6.3,1967 -George Lazenby,5,Mercury,142,6.8,22800000.0,82000000.0,1,8000000.0,On Her Majesty's Secret Service,1969-07-01,6.7,1969 -Sean Connery,7,Ford,120,6.7,43800000.0,116000000.0,0,7200000.0,Diamonds Are Forever,1971-03-01,6.3,1971 -Roger Moore,8,AMC,121,6.8,35400000.0,161800000.0,0,7000000.0,Live and Let Die,1973-08-01,5.9,1973 -Roger Moore,1,AMC,125,6.7,21000000.0,97600000.0,0,7000000.0,The Man with the Golden Gun,1974-07-01,5.1,1974 -Roger Moore,31,Lotus,125,7.1,46800000.0,185400000.0,1,14000000.0,The Spy Who Loved Me,1977-04-01,6.8,1977 -Roger Moore,12,Lotus,126,6.2,70300000.0,210300000.0,1,31000000.0,Moonraker,1979-10-01,5.7,1979 -Roger Moore,18,Citroen,127,6.8,54800000.0,195300000.0,0,28000000.0,For Your Eyes Only,1981-06-01,6.3,1981 -Roger Moore,15,Bajaj,131,6.5,67900000.0,187500000.0,0,27500000.0,Octopussy,1983-03-01,5.3,1983 -Roger Moore,5,Rolls Royce,131,6.2,50327960.0,152627960.0,0,30000000.0,A View to a Kill,1985-10-01,4.7,1985 -Timothy Dalton,13,Rolls Royce,130,6.7,51185000.0,191200000.0,2,40000000.0,The Living Daylights,1987-05-01,6.3,1987 -Timothy Dalton,10,Aston Martin,133,6.5,34667015.0,156167015.0,1,42000000.0,License to Kill,1989-01-01,6.0,1989 -Pierce Brosnan,47,BMW,130,7.2,106429941.0,356429941.0,1,60000000.0,GoldenEye,1995-09-01,6.9,1995 -Pierce Brosnan,30,Aston Martin,119,6.4,125304276.0,339504276.0,1,110000000.0,Tomorrow Never Dies,1997-07-01,6.0,1997 -Pierce Brosnan,27,BMW,128,6.3,126930660.0,361730660.0,1,135000000.0,The World Is Not Enough,1999-06-01,5.7,1999 -Pierce Brosnan,31,Aston Martin,133,6.0,160942139.0,431942139.0,2,142000000.0,Die Another Day,2002-08-01,6.1,2002 -Daniel Craig,11,Aston Martin,144,7.9,167365000.0,596365000.0,3,102000000.0,Casino Royale,2006-02-01,7.8,2006 -Daniel Craig,16,Aston Martin,106,6.7,169368427.0,591692078.0,6,230000000.0,Quantum of Solace,2008-12-01,6.1,2008 -Daniel Craig,26,Aston Martin,143,7.8,304360277.0,1108561108.0,1,200000000.0,Skyfall,2012-11-01,8.2,2012 -Daniel Craig,30,Aston Martin,148,6.8,200074175.0,879620923.0,1,245000000.0,Spectre,2015-09-01,6.4,2015 -Daniel Craig,14,Aston Martin,163,7.3,160891007.0,759959662.0,1,275000000.0,No Time to Die,2021-11-01,7.3,2021 +release_date,movie_title,bond_actor,car_manufacturer,income_usa,income_world,movie_budget,film_length,imdb,rotten_tomatoes,martinis_consumed,bond_kills,release_year +1962-06-01,Dr. No,Sean Connery,Sunbeam,16067035.0,59567035.0,1000000.0,110,7.3,7.7,2,4,1962 +1963-08-01,From Russia with Love,Sean Connery,Bentley,24800000.0,78900000.0,2000000.0,115,7.5,8.0,0,11,1963 +1964-05-01,Goldfinger,Sean Connery,Aston Martin,51100000.0,124900000.0,3000000.0,110,7.8,8.4,1,9,1964 +1965-09-01,Thunderball,Sean Connery,Aston Martin,63600000.0,141200000.0,9000000.0,130,7.0,6.8,0,20,1965 +1967-11-01,You Only Live Twice,Sean Connery,Toyota,43100000.0,111600000.0,9500000.0,117,6.9,6.3,1,21,1967 +1969-07-01,On Her Majesty's Secret Service,George Lazenby,Mercury,22800000.0,82000000.0,8000000.0,142,6.8,6.7,1,5,1969 +1971-03-01,Diamonds Are Forever,Sean Connery,Ford,43800000.0,116000000.0,7200000.0,120,6.7,6.3,0,7,1971 +1973-08-01,Live and Let Die,Roger Moore,AMC,35400000.0,161800000.0,7000000.0,121,6.8,5.9,0,8,1973 +1974-07-01,The Man with the Golden Gun,Roger Moore,AMC,21000000.0,97600000.0,7000000.0,125,6.7,5.1,0,1,1974 +1977-04-01,The Spy Who Loved Me,Roger Moore,Lotus,46800000.0,185400000.0,14000000.0,125,7.1,6.8,1,31,1977 +1979-10-01,Moonraker,Roger Moore,Lotus,70300000.0,210300000.0,31000000.0,126,6.2,5.7,1,12,1979 +1981-06-01,For Your Eyes Only,Roger Moore,Citroen,54800000.0,195300000.0,28000000.0,127,6.8,6.3,0,18,1981 +1983-03-01,Octopussy,Roger Moore,Bajaj,67900000.0,187500000.0,27500000.0,131,6.5,5.3,0,15,1983 +1985-10-01,A View to a Kill,Roger Moore,Rolls Royce,50327960.0,152627960.0,30000000.0,131,6.2,4.7,0,5,1985 +1987-05-01,The Living Daylights,Timothy Dalton,Rolls Royce,51185000.0,191200000.0,40000000.0,130,6.7,6.3,2,13,1987 +1989-01-01,License to Kill,Timothy Dalton,Aston Martin,34667015.0,156167015.0,42000000.0,133,6.5,6.0,1,10,1989 +1995-09-01,GoldenEye,Pierce Brosnan,BMW,106429941.0,356429941.0,60000000.0,130,7.2,6.9,1,47,1995 +1997-07-01,Tomorrow Never Dies,Pierce Brosnan,Aston Martin,125304276.0,339504276.0,110000000.0,119,6.4,6.0,1,30,1997 +1999-06-01,The World Is Not Enough,Pierce Brosnan,BMW,126930660.0,361730660.0,135000000.0,128,6.3,5.7,1,27,1999 +2002-08-01,Die Another Day,Pierce Brosnan,Aston Martin,160942139.0,431942139.0,142000000.0,133,6.0,6.1,2,31,2002 +2006-02-01,Casino Royale,Daniel Craig,Aston Martin,167365000.0,596365000.0,102000000.0,144,7.9,7.8,3,11,2006 +2008-12-01,Quantum of Solace,Daniel Craig,Aston Martin,169368427.0,591692078.0,230000000.0,106,6.7,6.1,6,16,2008 +2012-11-01,Skyfall,Daniel Craig,Aston Martin,304360277.0,1108561108.0,200000000.0,143,7.8,8.2,1,26,2012 +2015-09-01,Spectre,Daniel Craig,Aston Martin,200074175.0,879620923.0,245000000.0,148,6.8,6.4,1,30,2015 +2021-11-01,No Time to Die,Daniel Craig,Aston Martin,160891007.0,759959662.0,275000000.0,163,7.3,7.3,1,14,2021 diff --git a/data-analysis/requirements.txt b/data-analysis/requirements.txt new file mode 100644 index 0000000000..13a100581c --- /dev/null +++ b/data-analysis/requirements.txt @@ -0,0 +1,6 @@ +pandas==3.0.3 +matplotlib==3.10.9 +scikit-learn==1.9.0 +openpyxl==3.1.5 +pyarrow==24.0.0 +lxml==6.1.1