2,956
edits
(Created page with "<syntaxhighlight lang="sql"> -- This is used in the methodology of various olam.wiki pages -- Copyright Elie Goldman Smith, 2022 (Creative Commons license) -- Run this in SQLite3 ---------------------- -- Load data from various sources .mode csv -- cross-reference: crops (FAO data) with foods (USDA nutrition data): .import fao_usda.csv fao_usda -- nutrition (USDA): .import data/sr28-collated.csv...") |
No edit summary |
||
Line 14: | Line 14: | ||
-- crop production (FAO): | -- crop production (FAO): | ||
.import data/primary/fao/crops-primary.csv fao | .import data/primary/fao/crops-primary.csv fao | ||
-- population of regions (World Bank): | -- population of regions (World Bank): | ||
.import data/primary/API_SP.POP.TOTL_DS2_en_csv_v2_4150863.csv pop | .import data/primary/API_SP.POP.TOTL_DS2_en_csv_v2_4150863.csv pop | ||
-- global hunger index (.com) | -- global hunger index (.com) | ||
.import data/global-hunger-index-2021.csv ghi | .import data/global-hunger-index-2021.csv ghi | ||
-- region codes: | |||
.import data/primary/country-and-continent-codes-list-csv_csv.csv region | |||
.mode list | .mode list | ||
Line 40: | Line 40: | ||
-- Generate a secondary table for easier querying. | -- Generate a secondary table 'main' for easier querying. | ||
-- It's big, and it intentionally contains some redundancy. | -- It's big, and it intentionally contains some redundancy. | ||
CREATE TABLE | CREATE TEMPORARY TABLE almost_main AS | ||
SELECT | SELECT | ||
region_name, region_m49, | region_name, region_iso3, region_m49, region_population, region_cont, region_ghi,-- these 6 columns are always correlated | ||
crop_name, food_name, food_code, | crop_name, food_name, food_code, -- these 3 columns are always correlated | ||
production_food_grams_per_day, | production_food_grams_per_day, | ||
production_food_grams_per_day * fat_fraction_of_food AS production_fat_grams_per_day, | production_food_grams_per_day * fat_fraction_of_food AS production_fat_grams_per_day, | ||
Line 57: | Line 57: | ||
FROM ( | FROM ( | ||
SELECT | SELECT | ||
fao0.`Area Code (ISO3)`AS region_iso3, -- Technically `Area Code (ISO3)` is a "bare column", but it ALWAYS correlates with `Area`, so it's all good. | |||
fao0.`Area` AS region_name, | |||
fao0.`Item` AS crop_name, | |||
usda.`Long_desc` AS food_name, | |||
usda.`Long_desc` | usda.`NDB_No` AS food_code, | ||
usda.`NDB_No` | reg0.`Country_Number` AS region_m49, | ||
reg0.`Continent_Name` AS region_cont, | |||
-- population from most recent year available: | -- population from most recent year available: | ||
COALESCE(pop.`2021`,pop.`2020`,pop.`2019`,pop.`2018`,pop.`2017`,pop.`2016`,pop.`2015`,pop.`2014`,pop.`2013`,pop.`2012`,pop.`2011`) | COALESCE(pop.`2021`,pop.`2020`,pop.`2019`,pop.`2018`,pop.`2017`,pop.`2016`,pop.`2015`,pop.`2014`,pop.`2013`,pop.`2012`,pop.`2011`) | ||
AS region_population, | AS region_population, | ||
-- global hunger index from most recent year available: | -- global hunger index from most recent year available: | ||
Line 131: | Line 131: | ||
FROM usda | FROM usda | ||
JOIN fao_usda USING(`NDB_No`) | JOIN fao_usda USING(`NDB_No`) | ||
JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Production" GROUP BY `Area`,`Item`) p | JOIN (SELECT * FROM fao GROUP BY `Area`,`Item`) fao0 USING(`Item Code (FAO)`) -- Technically `Item Code (FAO)` is a "bare column", but it ALWAYS correlates with `Item`, so it's all good. | ||
JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Yield" GROUP BY `Area`,`Item`) y | LEFT JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Production" GROUP BY `Area`,`Item`) p ON fao0.`Area`=p.`Area` AND fao0.`Item`=p.`Item` | ||
JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Area harvested" GROUP BY `Area`,`Item`) a | LEFT JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Yield" GROUP BY `Area`,`Item`) y ON fao0.`Area`=y.`Area` AND fao0.`Item`=y.`Item` | ||
LEFT JOIN region ON | LEFT JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Area harvested" GROUP BY `Area`,`Item`) a ON fao0.`Area`=a.`Area` AND fao0.`Item`=a.`Item` | ||
LEFT JOIN pop ON | LEFT JOIN (SELECT *,MAX(`Continent_Name`)FROM region GROUP BY `Three_Letter_Country_Code`) reg0 ON fao0.`Area Code (ISO3)`= reg0.`Three_Letter_Country_Code` -- The GROUP BY is in there because otherwise: some country codes will match 2 rows (because of countries that span 2 continents) which would cause double-counting and fuck up the aggregation. Sorry Russia, you don't get both Europe and Asia lol | ||
LEFT JOIN ghi ON | LEFT JOIN pop ON fao0.`Area Code (ISO3)` = pop.`Country Code` -- for region_population | ||
LEFT JOIN ghi ON fao0.`Area` = ghi.`Country` -- for Global Hunger Index | |||
); | ); | ||
-- Population of the continents | |||
CREATE TEMPORARY TABLE cont_pop AS | |||
SELECT region_cont, SUM(DISTINCT region_population) AS region_cp -- The 'cp' stands for "continent population" | |||
FROM almost_main | |||
GROUP BY region_cont; | |||
-- Join it all together to create 'main' | |||
CREATE TABLE main AS | |||
SELECT * FROM almost_main LEFT JOIN cont_pop USING(region_cont); -- This adds the column 'region_cp' which is fully correlated with all the other columns that start with 'region_' | |||
-- Done, get rid of temporary tables | |||
DROP TABLE cont_pop; | |||
DROP TABLE almost_main; | |||
--- end of mandatory intro code to run --- | --- end of mandatory intro code to run --- | ||
Line 146: | Line 159: | ||
Line 213: | Line 179: | ||
FROM main GROUP BY region_name HAVING region_population; | FROM main GROUP BY region_name HAVING region_population; | ||
Line 263: | Line 216: | ||
-- Scenario: Every country grows mostly its best yielding crops -- | --==Scenario: Every country grows mostly its best yielding crops==-- | ||
-- | -- Farm land is reallocated in a way that favors high-yielding crops. | ||
-- | -- Each crop's land use will be proportional to the square of its yield (as scored above). | ||
-- | -- Total farmland remains the same. | ||
CREATE TEMPORARY TABLE scored AS | CREATE TEMPORARY TABLE scored AS | ||
SELECT *, yield_kalories_per_day_per_acre/2000 + yield_protein_grams_per_day_per_acre/60 AS score | SELECT *, yield_kalories_per_day_per_acre/2000 + yield_protein_grams_per_day_per_acre/60 AS score | ||
FROM main | FROM main | ||
WHERE region_name <> "World" -- "World" is the only aggregated region in the dataset; leave it out | WHERE region_name <> "World" -- "World" is the only aggregated region in the dataset; leave it out. | ||
AND crop_name <> "Mushrooms and truffles"; -- Mushrooms have exceptionally high yields, but this is because they grow without sun - they get their energy from the [[biomass waste]] of other food crops, generally. If you don't omit mushrooms, they end up being like 80% of the whole food supply. But this is not [[Term:viable|]] because there wouldn't be much biomass waste from other crops to grow the mushrooms in. | AND crop_name <> "Mushrooms and truffles"; -- Mushrooms have exceptionally high yields, but this is because they grow without sun - they get their energy from the [[biomass waste]] of other food crops, generally. If you don't omit mushrooms, they end up being like 80% of the whole food supply. But this is not [[Term:viable|]] because there wouldn't be much biomass waste from other crops to grow the mushrooms in. So mushrooms are removed from here, but are added back below, with the same production/yield/areaharvested as the status quo. | ||
UPDATE scored SET score = score/2 WHERE crop_name="Garlic"; -- Garlic has to be nerfed a bit too, because otherwise it'll recommend a whole pound of garlic per day per person! | |||
CREATE TEMPORARY TABLE scenario1 AS | CREATE TEMPORARY TABLE scenario1 AS | ||
Line 281: | Line 236: | ||
INSERT INTO scenario1 SELECT *, 0 AS score, 0 AS ratio, farmland_acres AS new_acres FROM main WHERE crop_name = "Mushrooms and truffles" AND region_name <> "World"; -- see above for explanation | INSERT INTO scenario1 SELECT *, 0 AS score, 0 AS ratio, farmland_acres AS new_acres FROM main WHERE crop_name = "Mushrooms and truffles" AND region_name <> "World"; -- see above for explanation | ||
DROP TABLE scored; | DROP TABLE scored; -- TABLE scenario1 is ready to be used now... | ||
-- Text report, by country | -- Text report, by country | ||
Line 288: | Line 244: | ||
" farmland: %.2f acres per person"||CHAR(10)|| | " farmland: %.2f acres per person"||CHAR(10)|| | ||
" crops per day per person:"||CHAR(10)|| | " crops per day per person:"||CHAR(10)|| | ||
" Status quo : %d kalories, %d grams of protein"||CHAR(10)|| | " Status quo : %d kalories, %d grams of protein, %d grams of fat"||CHAR(10)|| | ||
" New scenario: %d kalories, %d grams of protein"||CHAR(10)|| | " New scenario: %d kalories, %d grams of protein, %d grams of fat"||CHAR(10)|| | ||
" | " Top yielding crop: %s"||CHAR(10), | ||
region_name, | region_name, | ||
SUM(farmland_acres) /region_population, | SUM(farmland_acres) /region_population, | ||
SUM(production_kalories_per_day) /region_population, | SUM(production_kalories_per_day) /region_population, | ||
SUM(production_protein_grams_per_day) /region_population, | SUM(production_protein_grams_per_day) /region_population, | ||
SUM(production_fat_grams_per_day) /region_population, | |||
SUM(new_acres*yield_kalories_per_day_per_acre) /region_population, | SUM(new_acres*yield_kalories_per_day_per_acre) /region_population, | ||
SUM(new_acres*yield_protein_grams_per_day_per_acre)/region_population | SUM(new_acres*yield_protein_grams_per_day_per_acre)/region_population, | ||
SUM(new_acres*yield_fat_grams_per_day_per_acre) /region_population | |||
+ MAX(score)*0, -- tells sqlite to use the name of the top-scoring crop | + MAX(score)*0, -- tells sqlite to use the name of the top-scoring crop | ||
crop_name) | crop_name) | ||
Line 302: | Line 260: | ||
GROUP BY region_name; | GROUP BY region_name; | ||
-- | -- Text report, by continent | ||
SELECT PRINTF( | |||
"%s"||CHAR(10)|| | |||
" farmland: %.2f acres per person"||CHAR(10)|| | |||
" crops per day per person:"||CHAR(10)|| | |||
" Status quo : %d kalories, %d grams of protein, %d grams of fat"||CHAR(10)|| | |||
" New scenario: %d kalories, %d grams of protein, %d grams of fat"||CHAR(10), | |||
region_cont, | |||
SUM(farmland_acres) /region_cp, -- | |||
SUM(production_kalories_per_day) /region_cp, -- | |||
SUM(production_protein_grams_per_day) /region_cp, -- divide by | |||
SUM(production_fat_grams_per_day) /region_cp, -- continent population | |||
SUM(new_acres*yield_kalories_per_day_per_acre) /region_cp, -- | |||
SUM(new_acres*yield_protein_grams_per_day_per_acre)/region_cp, -- | |||
SUM(new_acres*yield_fat_grams_per_day_per_acre) /region_cp) -- | |||
FROM scenario1 | |||
GROUP BY region_cont; | |||
-- Text report, world summary | |||
SELECT PRINTF( | SELECT PRINTF( | ||
"World"||CHAR(10)|| | "World"||CHAR(10)|| | ||
Line 317: | Line 293: | ||
SUM(new_acres*yield_fat_grams_per_day_per_acre) /7.9e9) | SUM(new_acres*yield_fat_grams_per_day_per_acre) /7.9e9) | ||
FROM scenario1; | FROM scenario1; | ||
-- Data for generating maps | -- Data for generating maps | ||
-- food- | -- food-crop-production1-if-crop-choices.png | ||
SELECT region_m49 ||' '|| (SUM(new_acres*yield_kalories_per_day_per_acre)/SUM(farmland_acres)) | SELECT region_m49 ||' '|| (SUM(new_acres*yield_kalories_per_day_per_acre)/SUM(farmland_acres)) | ||
FROM scenario1 | FROM scenario1 | ||
GROUP BY region_m49 | GROUP BY region_m49 | ||
HAVING region_m49; -- use the results as input for: ./number-pairs-to-array-file crop- | HAVING region_m49; -- use the results as input for: ./number-pairs-to-array-file yields-if-crop-choices | ||
-- Also make a similar map of status quo, for comparison | -- Also make a similar map of status quo, for comparison | ||
-- food- | -- food-crop-production1.png | ||
SELECT region_m49 ||' '|| (SUM(production_kalories_per_day)/SUM(farmland_acres)) | SELECT region_m49 ||' '|| (SUM(production_kalories_per_day)/SUM(farmland_acres)) | ||
FROM scenario1 | FROM scenario1 | ||
GROUP BY region_m49 | GROUP BY region_m49 | ||
HAVING region_m49; -- use the results as input for: ./number-pairs-to-array-file | HAVING region_m49; -- use the results as input for: ./number-pairs-to-array-file yields | ||
-- Nutrition calculator links | |||
-- Food crops per day per capita | |||
-- For each of these, use the results as input for: make-nutrition0.9-url-from-sql-results.html | |||
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/7.9e9 AS f, food_name | SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/7.9e9 AS f, food_name | ||
FROM scenario1 | FROM scenario1 -- World | ||
GROUP BY food_name | GROUP BY food_name ORDER BY f DESC LIMIT 70; | ||
ORDER BY f DESC; -- | |||
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f, food_name | |||
FROM scenario1 WHERE region_cont="Africa" | |||
GROUP BY food_name ORDER BY f DESC LIMIT 70; | |||
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f, food_name | |||
FROM scenario1 WHERE region_cont="Asia" | |||
GROUP BY food_name ORDER BY f DESC LIMIT 70; | |||
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f, food_name | |||
FROM scenario1 WHERE region_cont="Europe" | |||
GROUP BY food_name ORDER BY f DESC LIMIT 70; | |||
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f, food_name | |||
FROM scenario1 WHERE region_cont="North America" | |||
GROUP BY food_name ORDER BY f DESC LIMIT 70; | |||
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f, food_name | |||
FROM scenario1 WHERE region_cont="Oceania" | |||
GROUP BY food_name ORDER BY f DESC LIMIT 70; | |||
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f, food_name | |||
FROM scenario1 WHERE region_cont="South America" | |||
GROUP BY food_name ORDER BY f DESC LIMIT 70; | |||
-- Also generate links for the status quo, for comparison: | |||
SELECT SUM(production_food_grams_per_day)/7.9e9 AS f, food_name | |||
FROM main WHERE region_name<>"World" -- summing it ourselves instead of using the pre-summed 'World' rows | |||
GROUP BY food_name ORDER BY f DESC LIMIT 70; | |||
SELECT SUM(production_food_grams_per_day)/region_cp AS f, food_name | |||
FROM main WHERE region_cont="Africa" | |||
GROUP BY food_name ORDER BY f DESC LIMIT 70; | |||
SELECT SUM(production_food_grams_per_day)/region_cp AS f, food_name | |||
FROM main WHERE region_cont="Asia" | |||
GROUP BY food_name ORDER BY f DESC LIMIT 70; | |||
SELECT SUM(production_food_grams_per_day)/region_cp AS f, food_name | |||
FROM main WHERE region_cont="Europe" | |||
GROUP BY food_name ORDER BY f DESC LIMIT 70; | |||
SELECT SUM(production_food_grams_per_day)/region_cp AS f, food_name | |||
FROM main WHERE region_cont="North America" | |||
GROUP BY food_name ORDER BY f DESC LIMIT 70; | |||
SELECT SUM(production_food_grams_per_day)/region_cp AS f, food_name | |||
FROM main WHERE region_cont="Oceania" | |||
GROUP BY food_name ORDER BY f DESC LIMIT 70; | |||
SELECT SUM(production_food_grams_per_day)/region_cp AS f, food_name | |||
FROM main WHERE region_cont="South America" | |||
GROUP BY food_name ORDER BY f DESC LIMIT 70; | |||
-- Done with the temporary table | |||
DROP TABLE scenario1; | |||
Line 394: | Line 426: | ||
SELECT AVG(`Value`) FROM fao | SELECT AVG(`Value`) FROM fao | ||
WHERE `Area`="World" AND `Item`="Seed cotton" AND `Element`="Area harvested"; | WHERE `Area`="World" AND `Item`="Seed cotton" AND `Element`="Area harvested"; | ||
--- MISC --- | |||
-- How much of wheat comes from Ukraine and Russia | |||
SELECT production_food_grams_per_day / 7.9e9 -- per capita, world population | |||
FROM main WHERE crop_name="Wheat" AND region_name="World"; | |||
SELECT production_food_grams_per_day / 7.9e9 -- per capita, world population | |||
FROM main WHERE crop_name="Wheat" AND region_name="Ukraine"; | |||
SELECT production_food_grams_per_day / 7.9e9 -- per capita, world population | |||
FROM main WHERE crop_name="Wheat" AND region_name="Russian Federation"; | |||
------------------------ | |||
----- CODE TESTING ----- | |||
-- Test out the fao_usda cross-reference table -- | |||
SELECT CHAR(10)|| | |||
"Crop name: "||`Item`||CHAR(10)|| | |||
"Food name: "||`Long_Desc`||CHAR(10)|| | |||
"Harvest (as defined) is "||((100-`Refuse`)*`Multiplier`)||"% food."||CHAR(10) | |||
FROM fao | |||
JOIN fao_usda USING(`Item Code (FAO)`) | |||
JOIN usda USING(`NDB_No`) | |||
WHERE `Area`="World" AND `Element`="Production" AND `Year`="2018"; | |||
-- Test out the fao_usda cross-reference table, again -- | |||
SELECT CHAR(10)|| | |||
'FAO "'||`Item`||'"'||CHAR(10)|| | |||
' -' ||(100-100*`Multiplier`)||'% ='||CHAR(10)|| | |||
'USDA "'||`Long_Desc`||'"'||CHAR(10)|| | |||
' -' ||`Refuse`||'% "'||`Ref_desc`||'" ='||CHAR(10)|| | |||
'Edible portion.'||CHAR(10) | |||
FROM fao | |||
JOIN fao_usda USING(`Item Code (FAO)`) | |||
JOIN usda USING(`NDB_No`) | |||
WHERE `Area`="World" AND `Element`="Production" AND `Year`="2018"; | |||
-- Test that "World" is the only aggregate -- | |||
-- Each pair of sums should match - they don't have to match perfectly but should be pretty close | |||
SELECT SUM(production_food_grams_per_day) FROM main WHERE region_name = "World"; | |||
SELECT SUM(production_food_grams_per_day) FROM main WHERE region_name <> "World"; | |||
SELECT SUM(production_kalories_per_day) FROM main WHERE region_name = "World"; | |||
SELECT SUM(production_kalories_per_day) FROM main WHERE region_name <> "World"; | |||
SELECT SUM(DISTINCT region_population) FROM main WHERE region_name = "World"; | |||
SELECT SUM(DISTINCT region_population) FROM main WHERE region_name <> "World"; | |||
-- Percent difference between sums (of production, for each crop): | |||
SELECT crop_name, CAST(((a-b)/b)*100 AS INT)||"%" | |||
FROM (SELECT SUM(production_food_grams_per_day) a, crop_name FROM main WHERE region_name<>"World" GROUP BY crop_name) | |||
JOIN (SELECT SUM(production_food_grams_per_day) b, crop_name FROM main WHERE region_name= "World" GROUP BY crop_name) | |||
USING(crop_name) | |||
ORDER BY crop_name; | |||
-- No two regions have identical population counts. [This fact allows us to use SUM(DISTINCT region_population) for aggregating regions. This was used in the creation of table 'main', to get the population of continents (main.region_cp)] | |||
-- Verify that it's true: | |||
SELECT region_population, region_name FROM main | |||
GROUP BY region_name | |||
ORDER BY CAST(region_population AS INT); | |||
</syntaxhighlight> | </syntaxhighlight> |