-- This is used in the methodology of various olam.wiki pages
-- Copyright Elie Goldman Smith, 2022 (Creative Commons license)
-- Run this in SQLite3
----------------------
-- Load data from various sources
.mode csv
-- cross-reference: crops (FAO data) with foods (USDA nutrition data):
.import fao_usda.csv fao_usda
-- nutrition (USDA):
.import data/sr28-collated.csv usda
-- crop production (FAO):
.import data/primary/fao/crops-primary.csv fao
-- population of regions (World Bank):
.import data/primary/API_SP.POP.TOTL_DS2_en_csv_v2_4150863.csv pop
-- global hunger index (.com)
.import data/global-hunger-index-2021.csv ghi
-- region codes:
.import data/primary/country-and-continent-codes-list-csv_csv.csv region
.mode list
-- Fix some of the region codes, to match the ISO3 standard
UPDATE pop SET `Country Code`="X01" WHERE `Country Code`="WLD";
-- Replace some empty strings with NULLs, so aggregate functions work
UPDATE fao SET `Value`=NULL WHERE `Value`='';
UPDATE pop SET `2021`=NULL WHERE `2021`='';
UPDATE pop SET `2020`=NULL WHERE `2020`='';
UPDATE pop SET `2019`=NULL WHERE `2019`='';
UPDATE pop SET `2018`=NULL WHERE `2018`='';
UPDATE pop SET `2017`=NULL WHERE `2017`='';
UPDATE pop SET `2016`=NULL WHERE `2016`='';
UPDATE pop SET `2015`=NULL WHERE `2015`='';
UPDATE pop SET `2014`=NULL WHERE `2014`='';
UPDATE pop SET `2013`=NULL WHERE `2013`='';
UPDATE pop SET `2012`=NULL WHERE `2012`='';
UPDATE pop SET `2011`=NULL WHERE `2011`='';
-- Generate a secondary table 'main' for easier querying.
-- It's big, and it intentionally contains some redundancy.
CREATE TEMPORARY TABLE almost_main AS
SELECT
region_name, region_iso3, region_m49, region_population, region_cont, region_ghi,-- these 6 columns are always correlated
crop_name, food_name, food_code, -- these 3 columns are always correlated
production_food_grams_per_day,
production_food_grams_per_day * fat_fraction_of_food AS production_fat_grams_per_day,
production_food_grams_per_day * protein_fraction_of_food AS production_protein_grams_per_day,
production_food_grams_per_day * kalories_per_gram_of_food AS production_kalories_per_day,
yield_food_grams_per_day_per_acre,
yield_food_grams_per_day_per_acre * fat_fraction_of_food AS yield_fat_grams_per_day_per_acre,
yield_food_grams_per_day_per_acre * protein_fraction_of_food AS yield_protein_grams_per_day_per_acre,
yield_food_grams_per_day_per_acre * kalories_per_gram_of_food AS yield_kalories_per_day_per_acre,
farmland_acres
FROM (
SELECT
fao0.`Area Code (ISO3)`AS region_iso3, -- Technically `Area Code (ISO3)` is a "bare column", but it ALWAYS correlates with `Area`, so it's all good.
fao0.`Area` AS region_name,
fao0.`Item` AS crop_name,
usda.`Long_desc` AS food_name,
usda.`NDB_No` AS food_code,
reg0.`Country_Number` AS region_m49,
reg0.`Continent_Name` AS region_cont,
-- population from most recent year available:
COALESCE(pop.`2021`,pop.`2020`,pop.`2019`,pop.`2018`,pop.`2017`,pop.`2016`,pop.`2015`,pop.`2014`,pop.`2013`,pop.`2012`,pop.`2011`)
AS region_population,
-- global hunger index from most recent year available:
ghi.`2021` AS region_ghi,
-- crop production...
-- in tonnes/year:
p.val
-- edible fraction of crop:
* fao_usda.`Multiplier` * 0.01*(100 - usda.`Refuse`)
-- conversion factor: "tonnes/year" to "grams/day":
* 2737.9093
-- result:
AS production_food_grams_per_day,
-- crop yields...
-- in hectograms/year/hectare:
y.val
-- edible fraction of crop:
* fao_usda.`Multiplier` * 0.01*(100 - usda.`Refuse`)
-- conversion factor: "hectograms/year/hectare" to "grams/day/acre":
* 0.11079926
-- result:
AS yield_food_grams_per_day_per_acre,
-- crop land...
-- in hectares:
a.val
-- conversion factor: "hectares" to "acres":
* 2.4710538
-- result:
AS farmland_acres,
-- protein...
-- grams per 100g food:
usda.`Protein (g)`
-- conversion factor:
* 0.01
-- result:
AS protein_fraction_of_food,
-- fat...
-- grams per 100g food:
usda.`Total lipid (fat) (g)`
-- conversion factor:
* 0.01
-- result:
AS fat_fraction_of_food,
-- kalories...
-- per 100g food:
usda.`Energy (kcal)`
-- conversion factor:
* 0.01
-- result:
AS kalories_per_gram_of_food
FROM usda
JOIN fao_usda USING(`NDB_No`)
JOIN (SELECT * FROM fao GROUP BY `Area`,`Item`) fao0 USING(`Item Code (FAO)`) -- Technically `Item Code (FAO)` is a "bare column", but it ALWAYS correlates with `Item`, so it's all good.
LEFT JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Production" GROUP BY `Area`,`Item`) p ON fao0.`Area`=p.`Area` AND fao0.`Item`=p.`Item`
LEFT JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Yield" GROUP BY `Area`,`Item`) y ON fao0.`Area`=y.`Area` AND fao0.`Item`=y.`Item`
LEFT JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Area harvested" GROUP BY `Area`,`Item`) a ON fao0.`Area`=a.`Area` AND fao0.`Item`=a.`Item`
LEFT JOIN (SELECT *,MAX(`Continent_Name`)FROM region GROUP BY `Three_Letter_Country_Code`) reg0 ON fao0.`Area Code (ISO3)`= reg0.`Three_Letter_Country_Code` -- The GROUP BY is in there because otherwise: some country codes will match 2 rows (because of countries that span 2 continents) which would cause double-counting and fuck up the aggregation. Sorry Russia, you don't get both Europe and Asia lol
LEFT JOIN pop ON fao0.`Area Code (ISO3)` = pop.`Country Code` -- for region_population
LEFT JOIN ghi ON fao0.`Area` = ghi.`Country` -- for Global Hunger Index
);
-- Population of the continents
CREATE TEMPORARY TABLE cont_pop AS
SELECT region_cont, SUM(DISTINCT region_population) AS region_cp -- The 'cp' stands for "continent population"
FROM almost_main
GROUP BY region_cont;
-- Join it all together to create 'main'
CREATE TABLE main AS
SELECT * FROM almost_main LEFT JOIN cont_pop USING(region_cont); -- This adds the column 'region_cp' which is fully correlated with all the other columns that start with 'region_'
-- Done, get rid of temporary tables
DROP TABLE cont_pop;
DROP TABLE almost_main;
--- end of mandatory intro code to run ---
------------------------------------------
-- Overview, by country --
SELECT
CHAR(10)||
region_name||CHAR(10)||
PRINTF(" %d kalories/day/person (primary)"||CHAR(10)||
" %d grams protein/day/person (primary)"||CHAR(10)||
" farmland: %.2f acres/person"||CHAR(10),
SUM(production_kalories_per_day) / region_population,
SUM(production_protein_grams_per_day) / region_population,
SUM(farmland_acres) / region_population)||
CASE WHEN region_ghi IS NOT NULL AND region_ghi <> ''
THEN ">hunger index: "||region_ghi||CHAR(10)
ELSE "" END
FROM main GROUP BY region_name HAVING region_population;
----- olam.wiki/Crop_choices -----
--==Best yielding crops for each region==
-- Generates a wiki table (but you still have to add the {| and |} manually).
-- Ranked from best (left) to worst (right).
-- Ranking system: 2000 kalories has the same "value" as 60 grams of protein.
-- .once -e
SELECT
"|'''"||region_name||"'''"||CHAR(10)||
"|.<br />.<br /> kalories/day/acre <br /> g protein/day/acre "||CHAR(10)||
PRINTF("|''Status quo<br />all crops averaged''<br />%d<br />%d",
SUM(production_kalories_per_day )/SUM(farmland_acres),
SUM(production_protein_grams_per_day)/SUM(farmland_acres)
) ||CHAR(10)||
GROUP_CONCAT(
PRINTF("|'''%s'''<br />%d<br />%d",
crop_name, yield_kalories_per_day_per_acre, yield_protein_grams_per_day_per_acre
), CHAR(10)
) ||CHAR(10)||'|-'
FROM (
SELECT * FROM main
ORDER BY yield_kalories_per_day_per_acre/2000 + yield_protein_grams_per_day_per_acre/60 DESC
)
GROUP BY region_name;
--==Scenario: Every country grows mostly its best yielding crops==--
-- Farm land is reallocated in a way that favors high-yielding crops.
-- Each crop's land use will be proportional to the square of its yield (as scored above).
-- Total farmland remains the same.
CREATE TEMPORARY TABLE scored AS
SELECT *, yield_kalories_per_day_per_acre/2000 + yield_protein_grams_per_day_per_acre/60 AS score
FROM main
WHERE region_name <> "World" -- "World" is the only aggregated region in the dataset; leave it out.
AND crop_name <> "Mushrooms and truffles"; -- Mushrooms have exceptionally high yields, but this is because they grow without sun - they get their energy from the [[biomass waste]] of other food crops, generally. If you don't omit mushrooms, they end up being like 80% of the whole food supply. But this is not [[Term:viable|]] because there wouldn't be much biomass waste from other crops to grow the mushrooms in. So mushrooms are removed from here, but are added back below, with the same production/yield/areaharvested as the status quo.
UPDATE scored SET score = score/2 WHERE crop_name="Garlic"; -- Garlic has to be nerfed a bit too, because otherwise it'll recommend a whole pound of garlic per day per person!
CREATE TEMPORARY TABLE scenario1 AS
SELECT *, score*score*ratio AS new_acres
FROM scored JOIN (
SELECT region_name, SUM(farmland_acres)/SUM(score*score) AS ratio
FROM scored GROUP BY region_name
) USING(region_name);
INSERT INTO scenario1 SELECT *, 0 AS score, 0 AS ratio, farmland_acres AS new_acres FROM main WHERE crop_name = "Mushrooms and truffles" AND region_name <> "World"; -- see above for explanation
DROP TABLE scored; -- TABLE scenario1 is ready to be used now...
-- Text report, by country
SELECT PRINTF(
"%s"||CHAR(10)||
" farmland: %.2f acres per person"||CHAR(10)||
" crops per day per person:"||CHAR(10)||
" Status quo : %d kalories, %d grams of protein, %d grams of fat"||CHAR(10)||
" New scenario: %d kalories, %d grams of protein, %d grams of fat"||CHAR(10)||
" Top yielding crop: %s"||CHAR(10),
region_name,
SUM(farmland_acres) /region_population,
SUM(production_kalories_per_day) /region_population,
SUM(production_protein_grams_per_day) /region_population,
SUM(production_fat_grams_per_day) /region_population,
SUM(new_acres*yield_kalories_per_day_per_acre) /region_population,
SUM(new_acres*yield_protein_grams_per_day_per_acre)/region_population,
SUM(new_acres*yield_fat_grams_per_day_per_acre) /region_population
+ MAX(score)*0, -- tells sqlite to use the name of the top-scoring crop
crop_name)
FROM scenario1
GROUP BY region_name;
-- Text report, by continent
SELECT PRINTF(
"%s"||CHAR(10)||
" farmland: %.2f acres per person"||CHAR(10)||
" crops per day per person:"||CHAR(10)||
" Status quo : %d kalories, %d grams of protein, %d grams of fat"||CHAR(10)||
" New scenario: %d kalories, %d grams of protein, %d grams of fat"||CHAR(10),
region_cont,
SUM(farmland_acres) /region_cp, --
SUM(production_kalories_per_day) /region_cp, --
SUM(production_protein_grams_per_day) /region_cp, -- divide by
SUM(production_fat_grams_per_day) /region_cp, -- continent population
SUM(new_acres*yield_kalories_per_day_per_acre) /region_cp, --
SUM(new_acres*yield_protein_grams_per_day_per_acre)/region_cp, --
SUM(new_acres*yield_fat_grams_per_day_per_acre) /region_cp) --
FROM scenario1
GROUP BY region_cont;
-- Text report, world summary
SELECT PRINTF(
"World"||CHAR(10)||
" farmland: %.2f acres per person"||CHAR(10)||
" crops per day per person:"||CHAR(10)||
" Status quo : %d kalories, %d grams of protein, %d grams of fat"||CHAR(10)||
" New scenario: %d kalories, %d grams of protein, %d grams of fat"||CHAR(10),
SUM(farmland_acres) /7.9e9,
SUM(production_kalories_per_day) /7.9e9,
SUM(production_protein_grams_per_day) /7.9e9,-- divide by
SUM(production_fat_grams_per_day) /7.9e9,-- world population
SUM(new_acres*yield_kalories_per_day_per_acre) /7.9e9,
SUM(new_acres*yield_protein_grams_per_day_per_acre)/7.9e9,
SUM(new_acres*yield_fat_grams_per_day_per_acre) /7.9e9)
FROM scenario1;
-- Data for generating maps
-- food-crop-production1-if-crop-choices.png
SELECT region_m49 ||' '|| (SUM(new_acres*yield_kalories_per_day_per_acre)/SUM(farmland_acres))
FROM scenario1
GROUP BY region_m49
HAVING region_m49; -- use the results as input for: ./number-pairs-to-array-file yields-if-crop-choices
-- Also make a similar map of status quo, for comparison
-- food-crop-production1.png
SELECT region_m49 ||' '|| (SUM(production_kalories_per_day)/SUM(farmland_acres))
FROM scenario1
GROUP BY region_m49
HAVING region_m49; -- use the results as input for: ./number-pairs-to-array-file yields
-- Nutrition calculator links
-- Food crops per day per capita
-- For each of these, use the results as input for: make-nutrition0.9-url-from-sql-results.html
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/7.9e9 AS f, food_name
FROM scenario1 -- World
GROUP BY food_name ORDER BY f DESC LIMIT 70;
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f, food_name
FROM scenario1 WHERE region_cont="Africa"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f, food_name
FROM scenario1 WHERE region_cont="Asia"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f, food_name
FROM scenario1 WHERE region_cont="Europe"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f, food_name
FROM scenario1 WHERE region_cont="North America"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f, food_name
FROM scenario1 WHERE region_cont="Oceania"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f, food_name
FROM scenario1 WHERE region_cont="South America"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
-- Also generate links for the status quo, for comparison:
SELECT SUM(production_food_grams_per_day)/7.9e9 AS f, food_name
FROM main WHERE region_name<>"World" -- summing it ourselves instead of using the pre-summed 'World' rows
GROUP BY food_name ORDER BY f DESC LIMIT 70;
SELECT SUM(production_food_grams_per_day)/region_cp AS f, food_name
FROM main WHERE region_cont="Africa"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
SELECT SUM(production_food_grams_per_day)/region_cp AS f, food_name
FROM main WHERE region_cont="Asia"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
SELECT SUM(production_food_grams_per_day)/region_cp AS f, food_name
FROM main WHERE region_cont="Europe"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
SELECT SUM(production_food_grams_per_day)/region_cp AS f, food_name
FROM main WHERE region_cont="North America"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
SELECT SUM(production_food_grams_per_day)/region_cp AS f, food_name
FROM main WHERE region_cont="Oceania"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
SELECT SUM(production_food_grams_per_day)/region_cp AS f, food_name
FROM main WHERE region_cont="South America"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
-- Done with the temporary table
DROP TABLE scenario1;
-- olam.wiki/Deforestation --
-- Crop land trends --
SELECT PRINTF("%s:"||CHAR(9)||"%s%.1f%%/year",
`Area`,
CASE WHEN y2020 > y2018 THEN "+" ELSE "" END,
100.0 * (y2020 - y2018) / (2020 - 2018) / y2018)
FROM (SELECT SUM(`Value`) AS y2018, `Area` FROM fao WHERE `Element`="Area harvested" AND `Year`="2018" GROUP BY `Area`)
JOIN (SELECT SUM(`Value`) AS y2020, `Area` FROM fao WHERE `Element`="Area harvested" AND `Year`="2020" GROUP BY `Area`) USING (`Area`);
-- olam.wiki/Clothing --
-- General overview of fibre crops --
-- global production, in grams per day per capita
-- Note: some of these aren't fibre crops, it's any crops that aren't in the fao_usda cross-reference table
SELECT AVG(`Value`) -- tonnes/year
* 2737.9093 -- convert to grams/day
/ 7.9e9 -- per capita (divide by world population)
AS g,
`Item`
FROM fao WHERE `Area`="World" AND `Element`="Production" AND (
`Item Code (FAO)` NOT IN (SELECT `Item Code (FAO)` FROM fao_usda)
OR `Item`="Seed cotton")
GROUP BY `Item`
ORDER BY g;
-- keep in mind that an average t-shirt weighs 142 grams.
-- olam.wiki/calc - Data points generated for the calculator --
-- cotton.production, tonnes/year (to be fair, a more accurate number might be found in crops-processed.csv)
SELECT AVG(`Value`*(1-`Multiplier`))
FROM fao JOIN fao_usda USING (`Item Code (FAO)`)
WHERE `Area`="World" AND `Item`="Seed cotton" AND `Element`="Production";
-- cotton.farmland, hectares
SELECT AVG(`Value`) FROM fao
WHERE `Area`="World" AND `Item`="Seed cotton" AND `Element`="Area harvested";
--- MISC ---
-- How much of wheat comes from Ukraine and Russia
SELECT production_food_grams_per_day / 7.9e9 -- per capita, world population
FROM main WHERE crop_name="Wheat" AND region_name="World";
SELECT production_food_grams_per_day / 7.9e9 -- per capita, world population
FROM main WHERE crop_name="Wheat" AND region_name="Ukraine";
SELECT production_food_grams_per_day / 7.9e9 -- per capita, world population
FROM main WHERE crop_name="Wheat" AND region_name="Russian Federation";
------------------------
----- CODE TESTING -----
-- Test out the fao_usda cross-reference table --
SELECT CHAR(10)||
"Crop name: "||`Item`||CHAR(10)||
"Food name: "||`Long_Desc`||CHAR(10)||
"Harvest (as defined) is "||((100-`Refuse`)*`Multiplier`)||"% food."||CHAR(10)
FROM fao
JOIN fao_usda USING(`Item Code (FAO)`)
JOIN usda USING(`NDB_No`)
WHERE `Area`="World" AND `Element`="Production" AND `Year`="2018";
-- Test out the fao_usda cross-reference table, again --
SELECT CHAR(10)||
'FAO "'||`Item`||'"'||CHAR(10)||
' -' ||(100-100*`Multiplier`)||'% ='||CHAR(10)||
'USDA "'||`Long_Desc`||'"'||CHAR(10)||
' -' ||`Refuse`||'% "'||`Ref_desc`||'" ='||CHAR(10)||
'Edible portion.'||CHAR(10)
FROM fao
JOIN fao_usda USING(`Item Code (FAO)`)
JOIN usda USING(`NDB_No`)
WHERE `Area`="World" AND `Element`="Production" AND `Year`="2018";
-- Test that "World" is the only aggregate --
-- Each pair of sums should match - they don't have to match perfectly but should be pretty close
SELECT SUM(production_food_grams_per_day) FROM main WHERE region_name = "World";
SELECT SUM(production_food_grams_per_day) FROM main WHERE region_name <> "World";
SELECT SUM(production_kalories_per_day) FROM main WHERE region_name = "World";
SELECT SUM(production_kalories_per_day) FROM main WHERE region_name <> "World";
SELECT SUM(DISTINCT region_population) FROM main WHERE region_name = "World";
SELECT SUM(DISTINCT region_population) FROM main WHERE region_name <> "World";
-- Percent difference between sums (of production, for each crop):
SELECT crop_name, CAST(((a-b)/b)*100 AS INT)||"%"
FROM (SELECT SUM(production_food_grams_per_day) a, crop_name FROM main WHERE region_name<>"World" GROUP BY crop_name)
JOIN (SELECT SUM(production_food_grams_per_day) b, crop_name FROM main WHERE region_name= "World" GROUP BY crop_name)
USING(crop_name)
ORDER BY crop_name;
-- No two regions have identical population counts. [This fact allows us to use SUM(DISTINCT region_population) for aggregating regions. This was used in the creation of table 'main', to get the population of continents (main.region_cp)]
-- Verify that it's true:
SELECT region_population, region_name FROM main
GROUP BY region_name
ORDER BY CAST(region_population AS INT);