8.8

Show how to represent the matrices used for computing PageRank as relations. Then write an SQL query that implements one iterative step of the iterative technique for finding PageRank; the entire algorithm can then be implemented as a loop containing the query.


The following query has been tested on Postgresql server version 13.7.

CREATE TABLE jump_probability_matrix (
    -- T[i, j] set to the probability that a random walker 
    -- who is following a link out of page i follows the 
    -- link to page j.
    T REAL[][]      
);

CREATE TABLE all_webpages ( 
    k INTEGER PRIMARY KEY,      -- web pages are given integer identifiers.
    page_rank REAL              -- the page rank. 
);

-- The function compute_page_rank_of_webpage() takes an integer paramter j 
-- and updates the table all_webpages, to reflect the new page rank of web page j. 
-- Look at Section "8.3.2.2 Ranking Using Hyperlinks" of the book.
CREATE FUNCTION compute_page_rank_of_webpage(j integer) RETURNS REAL AS $$
DECLARE
    -- N is the number of webpages. 
    N CONSTANT integer := (SELECT COUNT(*) FROM all_webpages);
    retval real := 0;
    temp real := 0;
BEGIN
    -- We have chose sigma to equal 0.15. 
    retval = 0.15 / N; 

    FOR i IN 1..N LOOP
        temp = temp + (
            (SELECT T[i][j] FROM jump_probability_matrix) * (SELECT page_rank FROM all_webpages WHERE k = i)
        );
    END LOOP;

    -- since sigma is 0.15, 1 - sigma is equal to 0.85
    temp = 0.85 * temp;
    retval = retval + temp;

    UPDATE all_webpages
    SET page_rank = retval
    WHERE k = j;

    RETURN retval;
END;
$$ LANGUAGE plpgsql;


CREATE FUNCTION compute_page_rank_of_all_webpages() RETURNS void AS $$
DECLARE
    -- N is the number of webpages. 
    N CONSTANT integer := (SELECT COUNT(*) FROM all_webpages);
BEGIN
    FOR i IN 1..N LOOP
        PERFORM compute_page_rank_of_webpage(i);
    END LOOP;
END;
$$ LANGUAGE plpgsql;

If you want to insert mock data to the above relations and try PageRank algorithm, use the following:-

-- Initally all webpages have the same page_rank. 
-- That is we assume all pages are equally popular.
INSERT INTO all_webpages (k, page_rank) VALUES 
    (1, 0.25),
    (2, 0.25),
    (3, 0.25),
    (4, 0.25);

-- I randomly generated a 4x4 matrix using numpy. 
-- used this: numpy.random.random((4, 4))
INSERT INTO jump_probability_matrix VALUES 
    (ARRAY[  
       [0.97707627, 0.17602739, 0.49650485, 0.34760888],
       [0.50885387, 0.53501379, 0.42921831, 0.65003344],
       [0.28540161, 0.57376113, 0.2110252 , 0.68359475],
       [0.31909116, 0.03811033, 0.25396854, 0.64540672]
    ]);

Now running,

SELECT * FROM all_webpages;

we get:-

 k | page_rank 
---+-----------
 1 |      0.25
 2 |      0.25
 3 |      0.25
 4 |      0.25
(4 rows)

Let’s perform the PageRank algorithm, as shown below:

DO $$                      
BEGIN
 PERFORM compute_page_rank_of_all_webpages();
END;
$$;

Now if we select the relation all_webpages we get the following:

 k | page_rank  
---+------------
 1 | 0.48171484
 2 | 0.35328886
 3 | 0.46850118
 4 |  0.7844074
(4 rows)