From 178c80316a0196fb7bf868a4ea5bc58c81044239 Mon Sep 17 00:00:00 2001
From: Tim Daly
Date: Sat, 26 Mar 2016 17:52:42 -0400
Subject: [PATCH] books/bookvolbib Elmr00 Recursive QR Factorization
Goal: Axiom Literate Programming
@article{Elmr00,
author = "Elmroth, E. and Gustavson, F. G.",
title = "Applying recursion to serial and parallel {QR} factorization leads
to better performance",
journal = "IBM Journal of Research and Development",
volume = "44",
number = "4",
month = "July",
year = "2000",
pages = "605--624",
doi = "10.1147/rd.444.0605",
url = "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.33.1820",
abstract =
"We present new recursive serial and parallel algorithms for QR
factorization of an $m$ by $n$ matrix. They improve performance. The
recursion leads to an automatic variable blocking, and it also
replaces a Level 2 part in a standard algorithm with Level 3
operations. However, there are significant additional costs for
creating and performing the updates, which prohibit the efficient use
of the recursion for large $n$. We present a quantitative analysis of
these extra costs. This analysis leads us to introduce a hybrid
recursive algorithm that outperforms the LAPACK algorithm DGEQRF by
about 20\% for large square matrices up to almost a factor of 3 for
tall thin matrices. Uniprocessor performance results are presented for
two IBM RS/6000 SP nodes -- a 120-MHz IBM POWER2 node and one
processor of a four-way 332-MHz IBM PowerPC 604e SMP node. The hybrid
recursive algorithm reaches more than 90\% of the theoretical peak
performance of the POWER2 node. Compared to standard block algorithms,
the recursive approach also shows a significant advantage in the
automatic tuning obtained from its automatic variable blocking. A
successful parallel implementation on a four-way 332-MHz IBM PPC604e
SMP node based on dynamic load balancing is presented. For two, three,
and four processors it shows speedups of up to 1.97, 2.99, and 3.97."
}
---
books/bookvolbib.pamphlet | 40 ++++++++++++++++++++++++++
changelog | 2 +
patch | 60 +++++++++++++++++++++++----------------
src/axiom-website/patches.html | 2 +
4 files changed, 79 insertions(+), 25 deletions(-)
diff --git a/books/bookvolbib.pamphlet b/books/bookvolbib.pamphlet
index 96e5259..c2e44f4 100644
--- a/books/bookvolbib.pamphlet
+++ b/books/bookvolbib.pamphlet
@@ -2012,6 +2012,46 @@ when shown in factored form.
\end{chunk}
+\index{Elmroth, E.}
+\index{Gustavson, F. G.}
+\begin{chunk}{axiom.bib}
+@article{Elmr00,
+ author = "Elmroth, E. and Gustavson, F. G.",
+ title = "Applying recursion to serial and parallel {QR} factorization leads
+ to better performance",
+ journal = "IBM Journal of Research and Development",
+ volume = "44",
+ number = "4",
+ month = "July",
+ year = "2000",
+ pages = "605--624",
+ doi = "10.1147/rd.444.0605",
+ url = "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.33.1820",
+ abstract =
+ "We present new recursive serial and parallel algorithms for QR
+ factorization of an $m$ by $n$ matrix. They improve performance. The
+ recursion leads to an automatic variable blocking, and it also
+ replaces a Level 2 part in a standard algorithm with Level 3
+ operations. However, there are significant additional costs for
+ creating and performing the updates, which prohibit the efficient use
+ of the recursion for large $n$. We present a quantitative analysis of
+ these extra costs. This analysis leads us to introduce a hybrid
+ recursive algorithm that outperforms the LAPACK algorithm DGEQRF by
+ about 20\% for large square matrices up to almost a factor of 3 for
+ tall thin matrices. Uniprocessor performance results are presented for
+ two IBM RS/6000 SP nodes -- a 120-MHz IBM POWER2 node and one
+ processor of a four-way 332-MHz IBM PowerPC 604e SMP node. The hybrid
+ recursive algorithm reaches more than 90\% of the theoretical peak
+ performance of the POWER2 node. Compared to standard block algorithms,
+ the recursive approach also shows a significant advantage in the
+ automatic tuning obtained from its automatic variable blocking. A
+ successful parallel implementation on a four-way 332-MHz IBM PPC604e
+ SMP node based on dynamic load balancing is presented. For two, three,
+ and four processors it shows speedups of up to 1.97, 2.99, and 3.97."
+}
+
+\end{chunk}
+
\index{Fateman, Richard J.}
\begin{chunk}{axiom.bib}
@misc{Fate13,
diff --git a/changelog b/changelog
index 4663256..788ef78 100644
--- a/changelog
+++ b/changelog
@@ -1,3 +1,5 @@
+20160326 tpd src/axiom-website/patches.html 20160326.02.tpd.patch
+20160326 tpd books/bookvolbib Elmr00 Recursive QR Factorization
20160326 tpd src/axiom-website/patches.html 20160326.01.tpd.patch
20160326 tpd books/bookvolbib Demm08 Householder QR
20160325 tpd src/axiom-website/patches.html 20160325.04.tpd.patch
diff --git a/patch b/patch
index ad5e115..acd6bdb 100644
--- a/patch
+++ b/patch
@@ -1,28 +1,38 @@
-books/bookvolbib Demm08 Householder QR
+books/bookvolbib Elmr00 Recursive QR Factorization
Goal: Axiom Literate Programming
-@techreport{Demm08,
- author = "Demmel, James and Hoemmen, Mark and Hida, Yozo
- and Riedy, E. Jason",
- title = "Non-Negative Diagonals and High Performance on Low-Profile
- Matrices from Householder QR",
- year = "2008",
- institution = "Univerity of California, Berkeley",
- type = "Technical Report",
- number = "203",
- paper = "Demm08.pdf",
- url = "http://www.netlib.org/lapack/lawnspdf/lawn203.pdf",
- abstract =
- "The Householder reflections used in LAPACK's QR factorization leave
- positive and negative real entries along R's diagonal. This is
- sufficient for most applications of QR factorizations, but a few
- require that R have a non-negative diagonal. This note provides a new
- Householder generation routine to produce a non-negative
- diagonal. Additionally, we find that scanning for trailing zeros in
- the generated reflections leads to large performance improvements when
- applying reflections with many trailing zeros. Factoring low-profile
- matrices, those with non-zero entries mostly near the diagonal (e.g
- band matrices), now requires far fewer operations. For example, QR
- factorization of matrices with profile width $b$ that are stored
- densely in an $n \cross n$ matrix improves form $O(n^3)$ to $O(n^2+nb^2)$."
+@article{Elmr00,
+ author = "Elmroth, E. and Gustavson, F. G.",
+ title = "Applying recursion to serial and parallel {QR} factorization leads
+ to better performance",
+ journal = "IBM Journal of Research and Development",
+ volume = "44",
+ number = "4",
+ month = "July",
+ year = "2000",
+ pages = "605--624",
+ doi = "10.1147/rd.444.0605",
+ url = "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.33.1820",
+ abstract =
+ "We present new recursive serial and parallel algorithms for QR
+ factorization of an $m$ by $n$ matrix. They improve performance. The
+ recursion leads to an automatic variable blocking, and it also
+ replaces a Level 2 part in a standard algorithm with Level 3
+ operations. However, there are significant additional costs for
+ creating and performing the updates, which prohibit the efficient use
+ of the recursion for large $n$. We present a quantitative analysis of
+ these extra costs. This analysis leads us to introduce a hybrid
+ recursive algorithm that outperforms the LAPACK algorithm DGEQRF by
+ about 20\% for large square matrices up to almost a factor of 3 for
+ tall thin matrices. Uniprocessor performance results are presented for
+ two IBM RS/6000 SP nodes -- a 120-MHz IBM POWER2 node and one
+ processor of a four-way 332-MHz IBM PowerPC 604e SMP node. The hybrid
+ recursive algorithm reaches more than 90\% of the theoretical peak
+ performance of the POWER2 node. Compared to standard block algorithms,
+ the recursive approach also shows a significant advantage in the
+ automatic tuning obtained from its automatic variable blocking. A
+ successful parallel implementation on a four-way 332-MHz IBM PPC604e
+ SMP node based on dynamic load balancing is presented. For two, three,
+ and four processors it shows speedups of up to 1.97, 2.99, and 3.97."
+}
diff --git a/src/axiom-website/patches.html b/src/axiom-website/patches.html
index 27c15f0..132c578 100644
--- a/src/axiom-website/patches.html
+++ b/src/axiom-website/patches.html
@@ -5258,6 +5258,8 @@ books/bookvolbib add Demm05 LAPACK Working Note 165

src/input/groeb2.input Demonstrate Groebner basis

20160326.01.tpd.patch
books/bookvolbib Demm08 Householder QR

+20160326.02.tpd.patch
+books/bookvolbib Elmr00 Recursive QR Factorization

--
1.7.5.4