Code:food1.sql

From the change wiki
Revision as of 12:28, 13 August 2022 by Elie (talk | contribs)
-- This is used in the methodology of various olam.wiki pages
-- Copyright Elie Goldman Smith, 2022 (Creative Commons license)
-- Run this in SQLite3
----------------------


-- Load data from various sources
.mode csv
--  cross-reference: crops (FAO data) with foods (USDA nutrition data):
.import fao_usda.csv                                           fao_usda
--  nutrition (USDA):
.import data/sr28-collated.csv                                 usda
--  crop production (FAO):
.import data/primary/fao/crops-primary.csv                     fao
--  population of regions (World Bank):
.import data/primary/API_SP.POP.TOTL_DS2_en_csv_v2_4150863.csv pop
--  global hunger index (.com)
.import data/global-hunger-index-2021.csv                      ghi
--  region codes:
.import data/primary/country-and-continent-codes-list-csv_csv.csv  region
.mode list

-- Fix some of the region codes, to match the ISO3 standard
UPDATE pop SET `Country Code`="X01" WHERE `Country Code`="WLD";

-- Replace some empty strings with NULLs, so aggregate functions work
UPDATE fao SET `Value`=NULL WHERE `Value`='';
UPDATE pop SET `2021`=NULL WHERE `2021`='';
UPDATE pop SET `2020`=NULL WHERE `2020`='';
UPDATE pop SET `2019`=NULL WHERE `2019`='';
UPDATE pop SET `2018`=NULL WHERE `2018`='';
UPDATE pop SET `2017`=NULL WHERE `2017`='';
UPDATE pop SET `2016`=NULL WHERE `2016`='';
UPDATE pop SET `2015`=NULL WHERE `2015`='';
UPDATE pop SET `2014`=NULL WHERE `2014`='';
UPDATE pop SET `2013`=NULL WHERE `2013`='';
UPDATE pop SET `2012`=NULL WHERE `2012`='';
UPDATE pop SET `2011`=NULL WHERE `2011`='';


-- Generate a secondary table 'main' for easier querying.
-- It's big, and it intentionally contains some redundancy.
CREATE TEMPORARY TABLE almost_main AS
SELECT
 region_name, region_iso3, region_m49, region_population, region_cont, region_ghi,-- these 6 columns are always correlated
 crop_name, food_name, food_code,                                                 -- these 3 columns are always correlated
 production_food_grams_per_day,
 production_food_grams_per_day * fat_fraction_of_food      AS production_fat_grams_per_day,
 production_food_grams_per_day * protein_fraction_of_food  AS production_protein_grams_per_day,
 production_food_grams_per_day * kalories_per_gram_of_food AS production_kalories_per_day,
 yield_food_grams_per_day_per_acre,
 yield_food_grams_per_day_per_acre * fat_fraction_of_food      AS yield_fat_grams_per_day_per_acre,
 yield_food_grams_per_day_per_acre * protein_fraction_of_food  AS yield_protein_grams_per_day_per_acre,
 yield_food_grams_per_day_per_acre * kalories_per_gram_of_food AS yield_kalories_per_day_per_acre,
 farmland_acres
FROM (
 SELECT
  fao0.`Area Code (ISO3)`AS region_iso3, -- Technically `Area Code (ISO3)` is a "bare column", but it ALWAYS correlates with `Area`, so it's all good.
  fao0.`Area`            AS region_name,
  fao0.`Item`            AS crop_name,
  usda.`Long_desc`       AS food_name,
  usda.`NDB_No`          AS food_code,
  reg0.`Country_Number`  AS region_m49,
  reg0.`Continent_Name`  AS region_cont,

  -- population from most recent year available:
  COALESCE(pop.`2021`,pop.`2020`,pop.`2019`,pop.`2018`,pop.`2017`,pop.`2016`,pop.`2015`,pop.`2014`,pop.`2013`,pop.`2012`,pop.`2011`)
  AS region_population,

  -- global hunger index from most recent year available:
  ghi.`2021`          AS region_ghi,


  -- crop production...
  -- in tonnes/year:
  p.val
  -- edible fraction of crop:
  * fao_usda.`Multiplier` * 0.01*(100 - usda.`Refuse`)
  -- conversion factor: "tonnes/year" to "grams/day":
  * 2737.9093
  -- result:
  AS production_food_grams_per_day,


  -- crop yields...
  -- in hectograms/year/hectare:
  y.val
  -- edible fraction of crop:
  * fao_usda.`Multiplier` * 0.01*(100 - usda.`Refuse`)
  -- conversion factor: "hectograms/year/hectare" to "grams/day/acre":
  * 0.11079926
  -- result:
  AS yield_food_grams_per_day_per_acre,
 

  -- crop land...
  -- in hectares:
  a.val
  -- conversion factor: "hectares" to "acres":
  * 2.4710538
  -- result:
  AS farmland_acres,


  -- protein...
  -- grams per 100g food:
  usda.`Protein (g)`
  -- conversion factor:
  * 0.01
  -- result:
  AS protein_fraction_of_food,


  -- fat...
  -- grams per 100g food:
  usda.`Total lipid (fat) (g)`
  -- conversion factor:
  * 0.01
  -- result:
  AS fat_fraction_of_food,


  -- kalories...
  -- per 100g food:
  usda.`Energy (kcal)`
  -- conversion factor:
  * 0.01
  -- result:
  AS kalories_per_gram_of_food 
 FROM usda
 JOIN fao_usda USING(`NDB_No`)
 JOIN      (SELECT *                      FROM fao                                  GROUP BY `Area`,`Item`) fao0 USING(`Item Code (FAO)`) -- Technically `Item Code (FAO)` is a "bare column", but it ALWAYS correlates with `Item`, so it's all good.
 LEFT JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Production"     GROUP BY `Area`,`Item`) p    ON fao0.`Area`=p.`Area` AND fao0.`Item`=p.`Item`
 LEFT JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Yield"          GROUP BY `Area`,`Item`) y    ON fao0.`Area`=y.`Area` AND fao0.`Item`=y.`Item`
 LEFT JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Area harvested" GROUP BY `Area`,`Item`) a    ON fao0.`Area`=a.`Area` AND fao0.`Item`=a.`Item`
 LEFT JOIN (SELECT *,MAX(`Continent_Name`)FROM region               GROUP  BY  `Three_Letter_Country_Code`) reg0 ON fao0.`Area Code (ISO3)`= reg0.`Three_Letter_Country_Code` -- The GROUP BY is in there because otherwise: some country codes will match 2 rows (because of countries that span 2 continents) which would cause double-counting and fuck up the aggregation. Sorry Russia, you don't get both Europe and Asia lol
 LEFT JOIN pop    ON fao0.`Area Code (ISO3)` = pop.`Country Code` -- for region_population
 LEFT JOIN ghi    ON fao0.`Area`             = ghi.`Country`      -- for Global Hunger Index
);

-- Population of the continents
CREATE TEMPORARY TABLE cont_pop AS
SELECT region_cont, SUM(DISTINCT region_population) AS region_cp -- The 'cp' stands for "continent population"
FROM almost_main
GROUP BY region_cont;

-- Join it all together to create 'main'
CREATE TABLE main AS
SELECT * FROM almost_main LEFT JOIN cont_pop USING(region_cont); -- This adds the column 'region_cp' which is fully correlated with all the other columns that start with 'region_'

-- Done, get rid of temporary tables
DROP TABLE cont_pop;
DROP TABLE almost_main;

--- end of mandatory intro code to run ---
------------------------------------------








-- Overview, by country --
SELECT
 CHAR(10)||
 region_name||CHAR(10)||
 PRINTF(" %d kalories/day/person     (primary)"||CHAR(10)||
        " %d grams protein/day/person  (primary)"||CHAR(10)||
        " farmland: %.2f acres/person"||CHAR(10),
        SUM(production_kalories_per_day)      / region_population,
        SUM(production_protein_grams_per_day) / region_population,
        SUM(farmland_acres)                   / region_population)||
 CASE WHEN region_ghi IS NOT NULL AND region_ghi <> ''
      THEN ">hunger index: "||region_ghi||CHAR(10)
      ELSE "" END
FROM main GROUP BY region_name HAVING region_population;









----- olam.wiki/Crop_choices -----


--==Best yielding crops for each region==
--   Generates a wiki table (but you still have to add the {| and |} manually).
--    Ranked from best (left) to worst (right).
--     Ranking system: 2000 kalories has the same "value" as 60 grams of protein.
-- .once -e
SELECT
 "|'''"||region_name||"'''"||CHAR(10)||
 "|.<br />.<br />&nbsp;kalories/day/acre&nbsp;<br />&nbsp;g&nbsp;protein/day/acre&nbsp;"||CHAR(10)||
 PRINTF("|''Status&nbsp;quo<br />all&nbsp;crops&nbsp;averaged''<br />%d<br />%d",
  SUM(production_kalories_per_day     )/SUM(farmland_acres),
  SUM(production_protein_grams_per_day)/SUM(farmland_acres)
 ) ||CHAR(10)||
 GROUP_CONCAT(
  PRINTF("|'''%s'''<br />%d<br />%d",
   crop_name, yield_kalories_per_day_per_acre, yield_protein_grams_per_day_per_acre
  ), CHAR(10)
 ) ||CHAR(10)||'|-'
FROM (
 SELECT * FROM main
 ORDER BY yield_kalories_per_day_per_acre/2000 + yield_protein_grams_per_day_per_acre/60 DESC
)
GROUP BY region_name;




--==Scenario: Every country grows mostly its best yielding crops==--
--   Farm land is reallocated in a way that favors high-yielding crops.
--   Each crop's land use will be proportional to the square of its yield (as scored above).
--   Total farmland remains the same.

CREATE TEMPORARY TABLE scored AS
SELECT *, yield_kalories_per_day_per_acre/2000 + yield_protein_grams_per_day_per_acre/60 AS score
FROM main
WHERE region_name <> "World" -- "World" is the only aggregated region in the dataset; leave it out.
AND crop_name <> "Mushrooms and truffles"; -- Mushrooms have exceptionally high yields, but this is because they grow without sun - they get their energy from the [[biomass waste]] of other food crops, generally. If you don't omit mushrooms, they end up being like 80% of the whole food supply. But this is not [[Term:viable|]] because there wouldn't be much biomass waste from other crops to grow the mushrooms in. So mushrooms are removed from here, but are added back below, with the same production/yield/areaharvested as the status quo.
UPDATE scored SET score = score/2 WHERE crop_name="Garlic"; -- Garlic has to be nerfed a bit too, because otherwise it'll recommend a whole pound of garlic per day per person!

CREATE TEMPORARY TABLE scenario1 AS
SELECT *, score*score*ratio AS new_acres
FROM scored JOIN (
 SELECT region_name, SUM(farmland_acres)/SUM(score*score) AS ratio
 FROM scored GROUP BY region_name
) USING(region_name);

INSERT INTO scenario1 SELECT *, 0 AS score, 0 AS ratio, farmland_acres AS new_acres FROM main WHERE crop_name = "Mushrooms and truffles" AND region_name <> "World"; -- see above for explanation
DROP TABLE scored; -- TABLE scenario1 is ready to be used now...


-- Text report, by country
SELECT PRINTF(
 "%s"||CHAR(10)||
 " farmland: %.2f acres per person"||CHAR(10)||
 " crops per day per person:"||CHAR(10)||
 "  Status quo  : %d kalories, %d grams of protein, %d grams of fat"||CHAR(10)||
 "  New scenario: %d kalories, %d grams of protein, %d grams of fat"||CHAR(10)||
 "    Top yielding crop: %s"||CHAR(10),
 region_name,
 SUM(farmland_acres)                                /region_population,
 SUM(production_kalories_per_day)                   /region_population,
 SUM(production_protein_grams_per_day)              /region_population,
 SUM(production_fat_grams_per_day)                  /region_population,
 SUM(new_acres*yield_kalories_per_day_per_acre)     /region_population,
 SUM(new_acres*yield_protein_grams_per_day_per_acre)/region_population,
 SUM(new_acres*yield_fat_grams_per_day_per_acre)    /region_population
 + MAX(score)*0, -- tells sqlite to use the name of the top-scoring crop
 crop_name)
FROM scenario1
GROUP BY region_name;

-- Text report, by continent
SELECT PRINTF(
 "%s"||CHAR(10)||
 " farmland: %.2f acres per person"||CHAR(10)||
 " crops per day per person:"||CHAR(10)||
 "  Status quo  : %d kalories, %d grams of protein, %d grams of fat"||CHAR(10)||
 "  New scenario: %d kalories, %d grams of protein, %d grams of fat"||CHAR(10),
 region_cont,
 SUM(farmland_acres)                                /region_cp, --
 SUM(production_kalories_per_day)                   /region_cp, -- 
 SUM(production_protein_grams_per_day)              /region_cp, -- divide by
 SUM(production_fat_grams_per_day)                  /region_cp, -- continent population
 SUM(new_acres*yield_kalories_per_day_per_acre)     /region_cp, --
 SUM(new_acres*yield_protein_grams_per_day_per_acre)/region_cp, --
 SUM(new_acres*yield_fat_grams_per_day_per_acre)    /region_cp) --
FROM scenario1
GROUP BY region_cont;

-- Text report, world summary
SELECT PRINTF(
 "World"||CHAR(10)||
 " farmland: %.2f acres per person"||CHAR(10)||
 " crops per day per person:"||CHAR(10)||
 "  Status quo  : %d kalories, %d grams of protein, %d grams of fat"||CHAR(10)||
 "  New scenario: %d kalories, %d grams of protein, %d grams of fat"||CHAR(10),
 SUM(farmland_acres)                                /7.9e9,
 SUM(production_kalories_per_day)                   /7.9e9,
 SUM(production_protein_grams_per_day)              /7.9e9,-- divide by
 SUM(production_fat_grams_per_day)                  /7.9e9,-- world population
 SUM(new_acres*yield_kalories_per_day_per_acre)     /7.9e9,
 SUM(new_acres*yield_protein_grams_per_day_per_acre)/7.9e9,
 SUM(new_acres*yield_fat_grams_per_day_per_acre)    /7.9e9)
FROM scenario1;


-- Data for generating maps
--  food-crop-production1-if-crop-choices.png
SELECT region_m49 ||' '|| (SUM(new_acres*yield_kalories_per_day_per_acre)/SUM(farmland_acres))
FROM scenario1
GROUP BY region_m49
HAVING region_m49; -- use the results as input for: ./number-pairs-to-array-file yields-if-crop-choices

-- Also make a similar map of status quo, for comparison
--  food-crop-production1.png
SELECT region_m49 ||' '|| (SUM(production_kalories_per_day)/SUM(farmland_acres))
FROM scenario1
GROUP BY region_m49
HAVING region_m49; -- use the results as input for: ./number-pairs-to-array-file yields


-- Nutrition calculator links
--  Food crops per day per capita
--   For each of these, use the results as input for:  make-nutrition0.9-url-from-sql-results.html

SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/7.9e9 AS f,  food_name
FROM scenario1                 -- World
GROUP BY food_name ORDER BY f DESC LIMIT 70;

SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f,  food_name
FROM scenario1 WHERE region_cont="Africa"
GROUP BY food_name ORDER BY f DESC LIMIT 70;

SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f,  food_name
FROM scenario1 WHERE region_cont="Asia"
GROUP BY food_name ORDER BY f DESC LIMIT 70;

SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f,  food_name
FROM scenario1 WHERE region_cont="Europe"
GROUP BY food_name ORDER BY f DESC LIMIT 70;

SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f,  food_name
FROM scenario1 WHERE region_cont="North America"
GROUP BY food_name ORDER BY f DESC LIMIT 70;

SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f,  food_name
FROM scenario1 WHERE region_cont="Oceania"
GROUP BY food_name ORDER BY f DESC LIMIT 70;

SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f,  food_name
FROM scenario1 WHERE region_cont="South America"
GROUP BY food_name ORDER BY f DESC LIMIT 70;

-- Also generate links for the status quo, for comparison:

SELECT SUM(production_food_grams_per_day)/7.9e9 AS f,  food_name
FROM main WHERE region_name<>"World" -- summing it ourselves instead of using the pre-summed 'World' rows
GROUP BY food_name ORDER BY f DESC LIMIT 70;

SELECT SUM(production_food_grams_per_day)/region_cp AS f,  food_name
FROM main WHERE region_cont="Africa"
GROUP BY food_name ORDER BY f DESC LIMIT 70;

SELECT SUM(production_food_grams_per_day)/region_cp AS f,  food_name
FROM main WHERE region_cont="Asia"
GROUP BY food_name ORDER BY f DESC LIMIT 70;

SELECT SUM(production_food_grams_per_day)/region_cp AS f,  food_name
FROM main WHERE region_cont="Europe"
GROUP BY food_name ORDER BY f DESC LIMIT 70;

SELECT SUM(production_food_grams_per_day)/region_cp AS f,  food_name
FROM main WHERE region_cont="North America"
GROUP BY food_name ORDER BY f DESC LIMIT 70;

SELECT SUM(production_food_grams_per_day)/region_cp AS f,  food_name
FROM main WHERE region_cont="Oceania"
GROUP BY food_name ORDER BY f DESC LIMIT 70;

SELECT SUM(production_food_grams_per_day)/region_cp AS f,  food_name
FROM main WHERE region_cont="South America"
GROUP BY food_name ORDER BY f DESC LIMIT 70;

-- Done with the temporary table
DROP TABLE scenario1;











-- olam.wiki/Deforestation --

-- Crop land trends --
SELECT PRINTF("%s:"||CHAR(9)||"%s%.1f%%/year",
              `Area`,
              CASE WHEN y2020 > y2018 THEN "+" ELSE "" END,
              100.0 * (y2020 - y2018) / (2020 - 2018) / y2018)
FROM (SELECT SUM(`Value`) AS y2018, `Area` FROM fao WHERE `Element`="Area harvested" AND `Year`="2018" GROUP BY `Area`)
JOIN (SELECT SUM(`Value`) AS y2020, `Area` FROM fao WHERE `Element`="Area harvested" AND `Year`="2020" GROUP BY `Area`) USING (`Area`);




-- olam.wiki/Clothing --
-- General overview of fibre crops --
--  global production, in grams per day per capita
--  Note: some of these aren't fibre crops, it's any crops that aren't in the fao_usda cross-reference table
SELECT AVG(`Value`) -- tonnes/year
       * 2737.9093  -- convert to grams/day
       / 7.9e9      -- per capita (divide by world population)
       AS g,
       `Item`
FROM fao WHERE `Area`="World" AND `Element`="Production" AND (
               `Item Code (FAO)` NOT IN (SELECT `Item Code (FAO)` FROM fao_usda)
            OR `Item`="Seed cotton")
GROUP BY `Item`
ORDER BY g;
-- keep in mind that an average t-shirt weighs 142 grams.



-- olam.wiki/calc - Data points generated for the calculator --

-- cotton.production, tonnes/year (to be fair, a more accurate number might be found in crops-processed.csv)
SELECT AVG(`Value`*(1-`Multiplier`))
FROM fao JOIN fao_usda USING (`Item Code (FAO)`)
WHERE `Area`="World" AND `Item`="Seed cotton" AND `Element`="Production";

-- cotton.farmland, hectares
SELECT AVG(`Value`) FROM fao
WHERE `Area`="World" AND `Item`="Seed cotton" AND `Element`="Area harvested";






--- MISC ---

-- How much of wheat comes from Ukraine and Russia
SELECT production_food_grams_per_day / 7.9e9 -- per capita, world population
FROM main WHERE crop_name="Wheat" AND region_name="World";

SELECT production_food_grams_per_day / 7.9e9 -- per capita, world population
FROM main WHERE crop_name="Wheat" AND region_name="Ukraine";

SELECT production_food_grams_per_day / 7.9e9 -- per capita, world population
FROM main WHERE crop_name="Wheat" AND region_name="Russian Federation";














------------------------
----- CODE TESTING -----





-- Test out the fao_usda cross-reference table --
SELECT CHAR(10)||
       "Crop name: "||`Item`||CHAR(10)||
       "Food name: "||`Long_Desc`||CHAR(10)||
       "Harvest (as defined) is "||((100-`Refuse`)*`Multiplier`)||"% food."||CHAR(10)
FROM fao
JOIN fao_usda USING(`Item Code (FAO)`)
JOIN usda     USING(`NDB_No`)
WHERE `Area`="World" AND `Element`="Production" AND `Year`="2018";



-- Test out the fao_usda cross-reference table, again --
SELECT CHAR(10)||
       'FAO  "'||`Item`||'"'||CHAR(10)||
       '  -'   ||(100-100*`Multiplier`)||'% ='||CHAR(10)||
       'USDA "'||`Long_Desc`||'"'||CHAR(10)||
       '  -'   ||`Refuse`||'% "'||`Ref_desc`||'" ='||CHAR(10)||
       'Edible portion.'||CHAR(10)
FROM fao
JOIN fao_usda USING(`Item Code (FAO)`)
JOIN usda     USING(`NDB_No`)
WHERE `Area`="World" AND `Element`="Production" AND `Year`="2018";



-- Test that "World" is the only aggregate --
--  Each pair of sums should match - they don't have to match perfectly but should be pretty close
SELECT SUM(production_food_grams_per_day) FROM main WHERE region_name = "World";
SELECT SUM(production_food_grams_per_day) FROM main WHERE region_name <> "World";
SELECT SUM(production_kalories_per_day) FROM main WHERE region_name = "World";
SELECT SUM(production_kalories_per_day) FROM main WHERE region_name <> "World";
SELECT SUM(DISTINCT region_population) FROM main WHERE region_name = "World";
SELECT SUM(DISTINCT region_population) FROM main WHERE region_name <> "World";
--  Percent difference between sums (of production, for each crop):
SELECT crop_name, CAST(((a-b)/b)*100 AS INT)||"%"
FROM (SELECT SUM(production_food_grams_per_day) a, crop_name FROM main WHERE region_name<>"World" GROUP BY crop_name)
JOIN (SELECT SUM(production_food_grams_per_day) b, crop_name FROM main WHERE region_name= "World" GROUP BY crop_name)
USING(crop_name)
ORDER BY crop_name;




-- No two regions have identical population counts. [This fact allows us to use SUM(DISTINCT region_population) for aggregating regions. This was used in the creation of table 'main', to get the population of continents (main.region_cp)]
--  Verify that it's true:
SELECT region_population, region_name FROM main
GROUP BY region_name
ORDER BY CAST(region_population AS INT);