Code:food1.sql: Difference between revisions

no edit summary
(Created page with "<syntaxhighlight lang="sql"> -- This is used in the methodology of various olam.wiki pages -- Copyright Elie Goldman Smith, 2022 (Creative Commons license) -- Run this in SQLite3 ---------------------- -- Load data from various sources .mode csv -- cross-reference: crops (FAO data) with foods (USDA nutrition data): .import fao_usda.csv fao_usda -- nutrition (USDA): .import data/sr28-collated.csv...")
 
No edit summary
Line 14: Line 14:
--  crop production (FAO):
--  crop production (FAO):
.import data/primary/fao/crops-primary.csv                    fao
.import data/primary/fao/crops-primary.csv                    fao
--  country codes (FAO):
.import data/primary/fao/FAOSTAT-country-codes.csv            region
--  population of regions (World Bank):
--  population of regions (World Bank):
.import data/primary/API_SP.POP.TOTL_DS2_en_csv_v2_4150863.csv pop
.import data/primary/API_SP.POP.TOTL_DS2_en_csv_v2_4150863.csv pop
--  global hunger index (.com)
--  global hunger index (.com)
.import data/global-hunger-index-2021.csv                      ghi
.import data/global-hunger-index-2021.csv                      ghi
--  region codes:
.import data/primary/country-and-continent-codes-list-csv_csv.csv  region
.mode list
.mode list


Line 40: Line 40:




-- Generate a secondary table for easier querying.
-- Generate a secondary table 'main' for easier querying.
-- It's big, and it intentionally contains some redundancy.
-- It's big, and it intentionally contains some redundancy.
CREATE TABLE main AS
CREATE TEMPORARY TABLE almost_main AS
SELECT
SELECT
  region_name, region_m49, region_iso3, region_population, region_ghi,
  region_name, region_iso3, region_m49, region_population, region_cont, region_ghi,-- these 6 columns are always correlated
  crop_name, food_name, food_code,
  crop_name, food_name, food_code,                                                 -- these 3 columns are always correlated
  production_food_grams_per_day,
  production_food_grams_per_day,
  production_food_grams_per_day * fat_fraction_of_food      AS production_fat_grams_per_day,
  production_food_grams_per_day * fat_fraction_of_food      AS production_fat_grams_per_day,
Line 57: Line 57:
FROM (
FROM (
  SELECT
  SELECT
   region.`M49 Code`  AS region_m49,
   fao0.`Area Code (ISO3)`AS region_iso3, -- Technically `Area Code (ISO3)` is a "bare column", but it ALWAYS correlates with `Area`, so it's all good.
  p.`Area Code (ISO3)`AS region_iso3, -- Technically this is a "bare column", but it ALWAYS correlates with `Area`, so it's all good.
   fao0.`Area`            AS region_name,
   p.`Area`            AS region_name,
   fao0.`Item`            AS crop_name,
   p.`Item`            AS crop_name,
   usda.`Long_desc`       AS food_name,
   usda.`Long_desc`   AS food_name,
   usda.`NDB_No`         AS food_code,
   usda.`NDB_No`       AS food_code,
  reg0.`Country_Number`  AS region_m49,
  reg0.`Continent_Name`  AS region_cont,


   -- population from most recent year available:
   -- population from most recent year available:
   COALESCE(pop.`2021`,pop.`2020`,pop.`2019`,pop.`2018`,pop.`2017`,pop.`2016`,pop.`2015`,pop.`2014`,pop.`2013`,pop.`2012`,pop.`2011`)
   COALESCE(pop.`2021`,pop.`2020`,pop.`2019`,pop.`2018`,pop.`2017`,pop.`2016`,pop.`2015`,pop.`2014`,pop.`2013`,pop.`2012`,pop.`2011`)
   AS region_population,
   AS region_population,


   -- global hunger index from most recent year available:
   -- global hunger index from most recent year available:
Line 131: Line 131:
  FROM usda
  FROM usda
  JOIN fao_usda USING(`NDB_No`)
  JOIN fao_usda USING(`NDB_No`)
  JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Production"    GROUP BY `Area`,`Item`) p   USING(`Item Code (FAO)`)
  JOIN      (SELECT *                      FROM fao                                  GROUP BY `Area`,`Item`) fao0 USING(`Item Code (FAO)`) -- Technically `Item Code (FAO)` is a "bare column", but it ALWAYS correlates with `Item`, so it's all good.
  JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Yield"          GROUP BY `Area`,`Item`) y   USING(`Area`,`Item`)
LEFT JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Production"    GROUP BY `Area`,`Item`) p   ON fao0.`Area`=p.`Area` AND fao0.`Item`=p.`Item`
  JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Area harvested" GROUP BY `Area`,`Item`) a   USING(`Area`,`Item`)
  LEFT JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Yield"          GROUP BY `Area`,`Item`) y   ON fao0.`Area`=y.`Area` AND fao0.`Item`=y.`Item`
  LEFT JOIN region ON p.`Area Code (ISO3)` = region.`ISO3 Code`
  LEFT JOIN (SELECT *, AVG(`Value`) AS val FROM fao WHERE `Element`="Area harvested" GROUP BY `Area`,`Item`) a   ON fao0.`Area`=a.`Area` AND fao0.`Item`=a.`Item`
  LEFT JOIN pop    ON p.`Area Code (ISO3)` = pop.`Country Code`
  LEFT JOIN (SELECT *,MAX(`Continent_Name`)FROM region               GROUP  BY  `Three_Letter_Country_Code`) reg0 ON fao0.`Area Code (ISO3)`= reg0.`Three_Letter_Country_Code` -- The GROUP BY is in there because otherwise: some country codes will match 2 rows (because of countries that span 2 continents) which would cause double-counting and fuck up the aggregation. Sorry Russia, you don't get both Europe and Asia lol
  LEFT JOIN ghi    ON p.`Area` = ghi.`Country`
  LEFT JOIN pop    ON fao0.`Area Code (ISO3)` = pop.`Country Code` -- for region_population
  LEFT JOIN ghi    ON fao0.`Area`             = ghi.`Country`     -- for Global Hunger Index
);
);


-- Population of the continents
CREATE TEMPORARY TABLE cont_pop AS
SELECT region_cont, SUM(DISTINCT region_population) AS region_cp -- The 'cp' stands for "continent population"
FROM almost_main
GROUP BY region_cont;


-- Join it all together to create 'main'
CREATE TABLE main AS
SELECT * FROM almost_main LEFT JOIN cont_pop USING(region_cont); -- This adds the column 'region_cp' which is fully correlated with all the other columns that start with 'region_'
-- Done, get rid of temporary tables
DROP TABLE cont_pop;
DROP TABLE almost_main;


--- end of mandatory intro code to run ---
--- end of mandatory intro code to run ---
Line 146: Line 159:




-- Test out the fao_usda cross-reference table --
SELECT CHAR(10)||
      "Crop name: "||`Item`||CHAR(10)||
      "Food name: "||`Long_Desc`||CHAR(10)||
      "Harvest (as defined) is "||((100-`Refuse`)*`Multiplier`)||"% food."||CHAR(10)
FROM fao
JOIN fao_usda USING(`Item Code (FAO)`)
JOIN usda    USING(`NDB_No`)
WHERE `Area`="World" AND `Element`="Production" AND `Year`="2018";
-- Test out the fao_usda cross-reference table --
SELECT CHAR(10)||
      'FAO  "'||`Item`||'"'||CHAR(10)||
      '  -'  ||(100-100*`Multiplier`)||'% ='||CHAR(10)||
      'USDA "'||`Long_Desc`||'"'||CHAR(10)||
      '  -'  ||`Refuse`||'% "'||`Ref_desc`||'" ='||CHAR(10)||
      'Edible portion.'||CHAR(10)
FROM fao
JOIN fao_usda USING(`Item Code (FAO)`)
JOIN usda    USING(`NDB_No`)
WHERE `Area`="World" AND `Element`="Production" AND `Year`="2018";
-- Test that "World" is the only aggregate --
--  All the other rows should add up to "World"
--  Numbers don't have to match perfectly but should be pretty close
SELECT SUM(production_food_grams_per_day) FROM main WHERE region_name = "World";
SELECT SUM(production_food_grams_per_day) FROM main WHERE region_name <> "World";
-- Global primary food production, per capita, to be shown in nutrition calculator --
SELECT production_food_grams_per_day / 7.9e9, food_name
FROM main WHERE region_name="World"
ORDER BY production_food_grams_per_day DESC;




Line 213: Line 179:
FROM main GROUP BY region_name HAVING region_population;
FROM main GROUP BY region_name HAVING region_population;


-- How much of wheat comes from Ukraine and Russia
SELECT production_food_grams_per_day / 7.9e9 -- per capita, world population
FROM main WHERE crop_name="Wheat" AND region_name="World";
SELECT production_food_grams_per_day / 7.9e9 -- per capita, world population
FROM main WHERE crop_name="Wheat" AND region_name="Ukraine";
SELECT production_food_grams_per_day / 7.9e9 -- per capita, world population
FROM main WHERE crop_name="Wheat" AND region_name="Russian Federation";




Line 263: Line 216:




-- Scenario: Every country grows mostly its best yielding crops --
--==Scenario: Every country grows mostly its best yielding crops==--
-- Farm land is reallocated in a way that favors high-yielding crops.
--   Farm land is reallocated in a way that favors high-yielding crops.
-- Each crop's land use will be proportional to the square of its yield (as scored above).
--   Each crop's land use will be proportional to the square of its yield (as scored above).
-- Total farmland remains the same.
--   Total farmland remains the same.
 
CREATE TEMPORARY TABLE scored AS
CREATE TEMPORARY TABLE scored AS
SELECT *, yield_kalories_per_day_per_acre/2000 + yield_protein_grams_per_day_per_acre/60 AS score
SELECT *, yield_kalories_per_day_per_acre/2000 + yield_protein_grams_per_day_per_acre/60 AS score
FROM main
FROM main
WHERE region_name <> "World" -- "World" is the only aggregated region in the dataset; leave it out
WHERE region_name <> "World" -- "World" is the only aggregated region in the dataset; leave it out.
AND crop_name <> "Mushrooms and truffles"; -- Mushrooms have exceptionally high yields, but this is because they grow without sun - they get their energy from the [[biomass waste]] of other food crops, generally. If you don't omit mushrooms, they end up being like 80% of the whole food supply. But this is not [[Term:viable|]] because there wouldn't be much biomass waste from other crops to grow the mushrooms in. And besides, who would want a diet of mostly mushrooms? Anyway, so mushrooms are removed from here, but are added back below, with the same production/yield/areaharvested as the status quo.
AND crop_name <> "Mushrooms and truffles"; -- Mushrooms have exceptionally high yields, but this is because they grow without sun - they get their energy from the [[biomass waste]] of other food crops, generally. If you don't omit mushrooms, they end up being like 80% of the whole food supply. But this is not [[Term:viable|]] because there wouldn't be much biomass waste from other crops to grow the mushrooms in. So mushrooms are removed from here, but are added back below, with the same production/yield/areaharvested as the status quo.
UPDATE scored SET score = score/2 WHERE crop_name="Garlic"; -- Garlic has to be nerfed a bit too, because otherwise it'll recommend a whole pound of garlic per day per person!


CREATE TEMPORARY TABLE scenario1 AS
CREATE TEMPORARY TABLE scenario1 AS
Line 281: Line 236:


INSERT INTO scenario1 SELECT *, 0 AS score, 0 AS ratio, farmland_acres AS new_acres FROM main WHERE crop_name = "Mushrooms and truffles" AND region_name <> "World"; -- see above for explanation
INSERT INTO scenario1 SELECT *, 0 AS score, 0 AS ratio, farmland_acres AS new_acres FROM main WHERE crop_name = "Mushrooms and truffles" AND region_name <> "World"; -- see above for explanation
DROP TABLE scored;
DROP TABLE scored; -- TABLE scenario1 is ready to be used now...
 


-- Text report, by country
-- Text report, by country
Line 288: Line 244:
  " farmland: %.2f acres per person"||CHAR(10)||
  " farmland: %.2f acres per person"||CHAR(10)||
  " crops per day per person:"||CHAR(10)||
  " crops per day per person:"||CHAR(10)||
  "  Status quo  : %d kalories, %d grams of protein"||CHAR(10)||
  "  Status quo  : %d kalories, %d grams of protein, %d grams of fat"||CHAR(10)||
  "  New scenario: %d kalories, %d grams of protein"||CHAR(10)||
  "  New scenario: %d kalories, %d grams of protein, %d grams of fat"||CHAR(10)||
  "     Top crop: %s"||CHAR(10),
  "   Top yielding crop: %s"||CHAR(10),
  region_name,
  region_name,
  SUM(farmland_acres)                                /region_population,
  SUM(farmland_acres)                                /region_population,
  SUM(production_kalories_per_day)                  /region_population,
  SUM(production_kalories_per_day)                  /region_population,
  SUM(production_protein_grams_per_day)              /region_population,
  SUM(production_protein_grams_per_day)              /region_population,
SUM(production_fat_grams_per_day)                  /region_population,
  SUM(new_acres*yield_kalories_per_day_per_acre)    /region_population,
  SUM(new_acres*yield_kalories_per_day_per_acre)    /region_population,
  SUM(new_acres*yield_protein_grams_per_day_per_acre)/region_population
  SUM(new_acres*yield_protein_grams_per_day_per_acre)/region_population,
SUM(new_acres*yield_fat_grams_per_day_per_acre)    /region_population
  + MAX(score)*0, -- tells sqlite to use the name of the top-scoring crop
  + MAX(score)*0, -- tells sqlite to use the name of the top-scoring crop
  crop_name)
  crop_name)
Line 302: Line 260:
GROUP BY region_name;
GROUP BY region_name;


-- Global summary
-- Text report, by continent
SELECT PRINTF(
"%s"||CHAR(10)||
" farmland: %.2f acres per person"||CHAR(10)||
" crops per day per person:"||CHAR(10)||
"  Status quo  : %d kalories, %d grams of protein, %d grams of fat"||CHAR(10)||
"  New scenario: %d kalories, %d grams of protein, %d grams of fat"||CHAR(10),
region_cont,
SUM(farmland_acres)                                /region_cp, --
SUM(production_kalories_per_day)                  /region_cp, --
SUM(production_protein_grams_per_day)              /region_cp, -- divide by
SUM(production_fat_grams_per_day)                  /region_cp, -- continent population
SUM(new_acres*yield_kalories_per_day_per_acre)    /region_cp, --
SUM(new_acres*yield_protein_grams_per_day_per_acre)/region_cp, --
SUM(new_acres*yield_fat_grams_per_day_per_acre)    /region_cp) --
FROM scenario1
GROUP BY region_cont;
 
-- Text report, world summary
SELECT PRINTF(
SELECT PRINTF(
  "World"||CHAR(10)||
  "World"||CHAR(10)||
Line 317: Line 293:
  SUM(new_acres*yield_fat_grams_per_day_per_acre)    /7.9e9)
  SUM(new_acres*yield_fat_grams_per_day_per_acre)    /7.9e9)
FROM scenario1;
FROM scenario1;


-- Data for generating maps
-- Data for generating maps
--  food-production-primary1-if-crop-choices.png
--  food-crop-production1-if-crop-choices.png
SELECT region_m49 ||' '|| (SUM(new_acres*yield_kalories_per_day_per_acre)/SUM(farmland_acres))
SELECT region_m49 ||' '|| (SUM(new_acres*yield_kalories_per_day_per_acre)/SUM(farmland_acres))
FROM scenario1
FROM scenario1
GROUP BY region_m49
GROUP BY region_m49
HAVING region_m49; -- use the results as input for: ./number-pairs-to-array-file crop-yields-scenario1
HAVING region_m49; -- use the results as input for: ./number-pairs-to-array-file yields-if-crop-choices


-- Also make a similar map of status quo, for comparison
-- Also make a similar map of status quo, for comparison
--  food-production-primary1.png
--  food-crop-production1.png
SELECT region_m49 ||' '|| (SUM(production_kalories_per_day)/SUM(farmland_acres))
SELECT region_m49 ||' '|| (SUM(production_kalories_per_day)/SUM(farmland_acres))
FROM scenario1
FROM scenario1
GROUP BY region_m49
GROUP BY region_m49
HAVING region_m49; -- use the results as input for: ./number-pairs-to-array-file crop-yields
HAVING region_m49; -- use the results as input for: ./number-pairs-to-array-file yields
 
 
-- Nutrition calculator links
--  Food crops per day per capita
--  For each of these, use the results as input for:  make-nutrition0.9-url-from-sql-results.html


-- Data for nutrition calculator
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/7.9e9 AS f,  food_name
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/7.9e9 AS f,  food_name
FROM scenario1
FROM scenario1                 -- World
GROUP BY food_name
GROUP BY food_name ORDER BY f DESC LIMIT 70;
ORDER BY f DESC; -- use the results as input for: make-nutrition0.9-url-from-sql-results.html
 
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f,  food_name
FROM scenario1 WHERE region_cont="Africa"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
 
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f,  food_name
FROM scenario1 WHERE region_cont="Asia"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
 
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f,  food_name
FROM scenario1 WHERE region_cont="Europe"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
 
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f,  food_name
FROM scenario1 WHERE region_cont="North America"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
 
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f,  food_name
FROM scenario1 WHERE region_cont="Oceania"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
 
SELECT SUM(new_acres*yield_food_grams_per_day_per_acre)/region_cp AS f,  food_name
FROM scenario1 WHERE region_cont="South America"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
 
-- Also generate links for the status quo, for comparison:
 
SELECT SUM(production_food_grams_per_day)/7.9e9 AS f,  food_name
FROM main WHERE region_name<>"World" -- summing it ourselves instead of using the pre-summed 'World' rows
GROUP BY food_name ORDER BY f DESC LIMIT 70;
 
SELECT SUM(production_food_grams_per_day)/region_cp AS f,  food_name
FROM main WHERE region_cont="Africa"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
 
SELECT SUM(production_food_grams_per_day)/region_cp AS f,  food_name
FROM main WHERE region_cont="Asia"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
 
SELECT SUM(production_food_grams_per_day)/region_cp AS f,  food_name
FROM main WHERE region_cont="Europe"
GROUP BY food_name ORDER BY f DESC LIMIT 70;
 
SELECT SUM(production_food_grams_per_day)/region_cp AS f,  food_name
FROM main WHERE region_cont="North America"
GROUP BY food_name ORDER BY f DESC LIMIT 70;


-- Done
SELECT SUM(production_food_grams_per_day)/region_cp AS f,  food_name
DROP TABLE scenario1;
FROM main WHERE region_cont="Oceania"
GROUP BY food_name ORDER BY f DESC LIMIT 70;


SELECT SUM(production_food_grams_per_day)/region_cp AS f,  food_name
FROM main WHERE region_cont="South America"
GROUP BY food_name ORDER BY f DESC LIMIT 70;


-- Done with the temporary table
DROP TABLE scenario1;




Line 394: Line 426:
SELECT AVG(`Value`) FROM fao
SELECT AVG(`Value`) FROM fao
WHERE `Area`="World" AND `Item`="Seed cotton" AND `Element`="Area harvested";
WHERE `Area`="World" AND `Item`="Seed cotton" AND `Element`="Area harvested";
--- MISC ---
-- How much of wheat comes from Ukraine and Russia
SELECT production_food_grams_per_day / 7.9e9 -- per capita, world population
FROM main WHERE crop_name="Wheat" AND region_name="World";
SELECT production_food_grams_per_day / 7.9e9 -- per capita, world population
FROM main WHERE crop_name="Wheat" AND region_name="Ukraine";
SELECT production_food_grams_per_day / 7.9e9 -- per capita, world population
FROM main WHERE crop_name="Wheat" AND region_name="Russian Federation";
------------------------
----- CODE TESTING -----
-- Test out the fao_usda cross-reference table --
SELECT CHAR(10)||
      "Crop name: "||`Item`||CHAR(10)||
      "Food name: "||`Long_Desc`||CHAR(10)||
      "Harvest (as defined) is "||((100-`Refuse`)*`Multiplier`)||"% food."||CHAR(10)
FROM fao
JOIN fao_usda USING(`Item Code (FAO)`)
JOIN usda    USING(`NDB_No`)
WHERE `Area`="World" AND `Element`="Production" AND `Year`="2018";
-- Test out the fao_usda cross-reference table, again --
SELECT CHAR(10)||
      'FAO  "'||`Item`||'"'||CHAR(10)||
      '  -'  ||(100-100*`Multiplier`)||'% ='||CHAR(10)||
      'USDA "'||`Long_Desc`||'"'||CHAR(10)||
      '  -'  ||`Refuse`||'% "'||`Ref_desc`||'" ='||CHAR(10)||
      'Edible portion.'||CHAR(10)
FROM fao
JOIN fao_usda USING(`Item Code (FAO)`)
JOIN usda    USING(`NDB_No`)
WHERE `Area`="World" AND `Element`="Production" AND `Year`="2018";
-- Test that "World" is the only aggregate --
--  Each pair of sums should match - they don't have to match perfectly but should be pretty close
SELECT SUM(production_food_grams_per_day) FROM main WHERE region_name = "World";
SELECT SUM(production_food_grams_per_day) FROM main WHERE region_name <> "World";
SELECT SUM(production_kalories_per_day) FROM main WHERE region_name = "World";
SELECT SUM(production_kalories_per_day) FROM main WHERE region_name <> "World";
SELECT SUM(DISTINCT region_population) FROM main WHERE region_name = "World";
SELECT SUM(DISTINCT region_population) FROM main WHERE region_name <> "World";
--  Percent difference between sums (of production, for each crop):
SELECT crop_name, CAST(((a-b)/b)*100 AS INT)||"%"
FROM (SELECT SUM(production_food_grams_per_day) a, crop_name FROM main WHERE region_name<>"World" GROUP BY crop_name)
JOIN (SELECT SUM(production_food_grams_per_day) b, crop_name FROM main WHERE region_name= "World" GROUP BY crop_name)
USING(crop_name)
ORDER BY crop_name;
-- No two regions have identical population counts. [This fact allows us to use SUM(DISTINCT region_population) for aggregating regions. This was used in the creation of table 'main', to get the population of continents (main.region_cp)]
--  Verify that it's true:
SELECT region_population, region_name FROM main
GROUP BY region_name
ORDER BY CAST(region_population AS INT);
</syntaxhighlight>
</syntaxhighlight>