diff --git a/doc/TODO.detail/README b/doc/TODO.detail/README new file mode 100644 index 0000000000..1ca6bcf1af --- /dev/null +++ b/doc/TODO.detail/README @@ -0,0 +1,2 @@ +These files are in standard Unix mailbox format, and are detail +information related to the TODO list. diff --git a/doc/TODO.detail/alpha b/doc/TODO.detail/alpha new file mode 100644 index 0000000000..9f075a8389 --- /dev/null +++ b/doc/TODO.detail/alpha @@ -0,0 +1,107 @@ +From owner-pgsql-hackers@hub.org Fri May 14 16:00:46 1999 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id QAA02173 + for ; Fri, 14 May 1999 16:00:44 -0400 (EDT) +Received: from hub.org (hub.org [209.167.229.1]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id QAA02824 for ; Fri, 14 May 1999 16:00:45 -0400 (EDT) +Received: from hub.org (hub.org [209.167.229.1]) + by hub.org (8.9.3/8.9.3) with ESMTP id PAA47798; + Fri, 14 May 1999 15:57:54 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Fri, 14 May 1999 15:54:30 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.9.3/8.9.3) id PAA47191 + for pgsql-hackers-outgoing; Fri, 14 May 1999 15:54:28 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +Received: from thelab.hub.org (nat194.147.mpoweredpc.net [142.177.194.147]) + by hub.org (8.9.3/8.9.3) with ESMTP id PAA46457 + for ; Fri, 14 May 1999 15:49:35 -0400 (EDT) + (envelope-from scrappy@hub.org) +Received: from localhost (scrappy@localhost) + by thelab.hub.org (8.9.3/8.9.1) with ESMTP id QAA16128; + Fri, 14 May 1999 16:49:44 -0300 (ADT) + (envelope-from scrappy@hub.org) +X-Authentication-Warning: thelab.hub.org: scrappy owned process doing -bs +Date: Fri, 14 May 1999 16:49:44 -0300 (ADT) +From: The Hermit Hacker +To: pgsql-hackers@postgreSQL.org +cc: Jack Howarth +Subject: [HACKERS] postgresql bug report (fwd) +Message-ID: +MIME-Version: 1.0 +Content-Type: TEXT/PLAIN; charset=US-ASCII +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + + +Marc G. Fournier ICQ#7615664 IRC Nick: Scrappy +Systems Administrator @ hub.org +primary: scrappy@hub.org secondary: scrappy@{freebsd|postgresql}.org + +---------- Forwarded message ---------- +Date: Fri, 14 May 1999 14:50:58 -0400 +From: Jack Howarth +To: scrappy@hub.org +Subject: postgresql bug report + +Marc, + In porting the RedHat 6.0 srpm set for a linuxppc release we +believe a bug has been identified in +the postgresql source for 6.5-0.beta1. Our development tools are as +follows... + +glibc 2.1.1 pre 2 +linux 2.2.6 +egcs 1.1.2 +the latest binutils snapshot + +The bug that we see is that when egcs compiles postgresql at -O1 or +higher (-O0 is fine), +postgresql creates incorrectly formed databases such that when the user +does a destroydb +the database can not be destroyed. Franz Sirl has identified the problem +as follows... + + it seems that this problem is a type casting/promotion bug in the +source. The + routine _bt_checkkeys() in backend/access/nbtree/nbtutils.c calls +int2eq() in + backend/utils/adt/int.c via a function pointer +*fmgr_faddr(&key[0].sk_func). As + the type information for int2eq is lost via the function pointer, +the compiler + passes 2 ints, but int2eq expects 2 (preformatted in a 32bit reg) +int16's. 
+ This particular bug goes away, if I for example change int2eq to: + + bool + int2eq(int32 arg1, int32 arg2) + { + return (int16)arg1 == (int16)arg2; + } + + This moves away the type casting/promotion "work" from caller to the +callee and + is probably the right thing to do for functions used via function +pointers. + +...because of the large number of changes required to do this, Franz +thought we should +pass this on to the postgresql maintainers for correction. Please feel +free to contact +Franz Sirl (Franz.Sirl-kernel@lauterbach.com) if you have any questions +on this bug +report. + +-- +------------------------------------------------------------------------------ +Jack W. Howarth, Ph.D. 231 Bethesda Avenue +NMR Facility Director Cincinnati, Ohio 45267-0524 +Dept. of Molecular Genetics phone: (513) 558-4420 +Univ. of Cincinnati College of Medicine fax: (513) 558-8474 + + + + + + diff --git a/doc/TODO.detail/arrays b/doc/TODO.detail/arrays new file mode 100644 index 0000000000..3c90814055 --- /dev/null +++ b/doc/TODO.detail/arrays @@ -0,0 +1,94 @@ +From owner-pgsql-hackers@hub.org Wed Nov 25 19:01:02 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id TAA16399 + for ; Wed, 25 Nov 1998 19:01:01 -0500 (EST) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id SAA05250 for ; Wed, 25 Nov 1998 18:53:12 -0500 (EST) +Received: from localhost (majordom@localhost) + by hub.org (8.9.1/8.9.1) with SMTP id SAA17798; + Wed, 25 Nov 1998 18:49:38 -0500 (EST) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 25 Nov 1998 18:49:07 +0000 (EST) +Received: (from majordom@localhost) + by hub.org (8.9.1/8.9.1) id SAA17697 + for pgsql-hackers-outgoing; Wed, 25 Nov 1998 18:49:06 -0500 (EST) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +Received: from mail.enterprise.net (root@mail.enterprise.net [194.72.192.18]) + by hub.org (8.9.1/8.9.1) with ESMTP id SAA17650; + Wed, 25 Nov 1998 18:48:55 -0500 (EST) + (envelope-from olly@lfix.co.uk) +Received: from linda.lfix.co.uk (root@max01-040.enterprise.net [194.72.197.40]) + by mail.enterprise.net (8.8.5/8.8.5) with ESMTP id XAA20539; + Wed, 25 Nov 1998 23:48:52 GMT +Received: from linda.lfix.co.uk (olly@localhost [127.0.0.1]) + by linda.lfix.co.uk (8.9.1a/8.9.1/Debian/GNU) with ESMTP id XAA12089; + Wed, 25 Nov 1998 23:48:52 GMT +Message-Id: <199811252348.XAA12089@linda.lfix.co.uk> +X-Mailer: exmh version 2.0.2 2/24/98 (debian) +X-URL: http://www.lfix.co.uk/oliver +X-face: "xUFVDj+ZJtL_IbURmI}!~xAyPC"Mrk=MkAm&tPQnNq(FWxv49R}\>0oI8VM?O2VY+N7@F- + KMLl*!h}B)u@TW|B}6 +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +This was reported as a bug with the Debian package of 6.3.2; the same +behaviour is still present in 6.4. 
+ +bray=> create table foo ( t text[]); +CREATE +bray=> insert into foo values ( '{"a"}'); +INSERT 201354 1 +bray=> insert into foo values ( '{"a","b"}'); +INSERT 201355 1 +bray=> insert into foo values ( '{"a","b","c"}'); +INSERT 201356 1 +bray=> select * from foo; +t +------------- +{"a"} +{"a","b"} +{"a","b","c"} +(3 rows) + +bray=> select t[1] from foo; +ERROR: type name lookup of t failed +bray=> select * from foo; +t +------------- +{"a"} +{"a","b"} +{"a","b","c"} +(3 rows) + +bray=> select foo.t[1] from foo; +t +- +a +a +a +(3 rows) + +bray=> select count(foo.t[1]) from foo; +pqReadData() -- backend closed the channel unexpectedly. + +-- +Oliver Elphick Oliver.Elphick@lfix.co.uk +Isle of Wight http://www.lfix.co.uk/oliver + PGP key from public servers; key ID 32B8FAA1 + ======================================== + "Let us therefore come boldly unto the throne of grace, + that we may obtain mercy, and find grace to help in + time of need." Hebrews 4:16 + + + + diff --git a/doc/TODO.detail/cnfify b/doc/TODO.detail/cnfify new file mode 100644 index 0000000000..c3b5ff92d7 --- /dev/null +++ b/doc/TODO.detail/cnfify @@ -0,0 +1,1556 @@ +From daybee@bellatlantic.net Sun Aug 23 20:21:48 1998 +Received: from iconmail.bellatlantic.net (iconmail.bellatlantic.net [199.173.162.30]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id UAA26688 + for ; Sun, 23 Aug 1998 20:21:46 -0400 (EDT) +Received: from bellatlantic.net (client196-126-169.bellatlantic.net [151.196.126.169]) + by iconmail.bellatlantic.net (IConNet Sendmail) with ESMTP id UAA09478; + Sun, 23 Aug 1998 20:18:35 -0400 (EDT) +Message-ID: <35E0ABF0.578694C8@bellatlantic.net> +Date: Sun, 23 Aug 1998 19:55:29 -0400 +From: David Hartwig +Organization: Home +X-Mailer: Mozilla 4.04 [en] (Win95; I) +MIME-Version: 1.0 +To: Bruce Momjian +CC: hannu@trust.ee, pgsql-interfaces@postgreSQL.org, hackers@postgreSQL.org +Subject: Re: [INTERFACES] Re: [HACKERS] changes in 6.4 +References: <199808220353.XAA04528@candle.pha.pa.us> +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +Status: ROr + + + +Bruce Momjian wrote: + +> > +> > Hannu Krosing wrote: +> > +> > > > The days where every release fixed server crashes, or added a feature +> > > > that users were 'screaming for' may be a thing of the past. +> > > +> > > Is anyone working on fixing the exploding optimisations for many OR-s, +> > > at least the canonic case used by access? +> > > +> > > My impression is that this has fallen somewhere between +> > > insightdist and Vadim. +> > +> > This is really big for the ODBCers. (And I suspect for JDBCers too.) Many +> > desktop libraries and end-user tools depend on this "record set" strategy to +> > operate effectively. +> > +> > I have put together a workable hack that runs just before cnfify(). The +> > option is activated through the SET command. Once activated, it identifies +> > queries with this particular multi-OR pattern generated by these RECORD SET +> > strategies. Qualified query trees are rewritten as multiple UNIONs. (One +> > for each OR grouping). +> > +> > The results are profound. Queries that used to scan tables because of the +> > ORs, now make use of any indexes. Thus, the size of the table has virtually +> > no effect on performance. Furthermore, queries that used to crash the +> > backend, now run in under a second. +> > +> > Currently the down sides are: +> > 1. If there is no usable index, performance is significantly worse. The +> > patch does not check to make sure that there is a usable index. 
I could use +> > some pointers on this. +> > +> > 2. Small tables are actually a bit slower than without the patch. +> > +> > 3. Not very elegant. I am looking for a more generalized solution. +> > I have lots of ideas, but I would need to know the backend much better before +> > attempting any of them. My favorite idea is before cnfify(), to factor the +> > OR terms and pull out the constants into a virtual (temporary) table spaces. +> > Then rewrite the query as a join. The optimizer will (should) treat the new +> > query accordingly. This assumes that an efficient factoring algorithm exists +> > and that temporary tables can exist in the heap. +> > +> > Illustration: +> > SELECT ... FROM tab WHERE +> > (var1 = const1 AND var2 = const2) OR +> > (var1 = const3 AND var2 = const4) OR +> > (var1 = const5 AND var2 = const6) +> > +> > SELECT ... FROM tab, tmp WHERE +> > (var1 = var_x AND var2 = var_y) +> > +> > tmp +> > var_x | var_y +> > -------------- +> > const1|const2 +> > const3|const4 +> > const5|const6 +> +> David, where are we on this? I know we have OR's using indexes. Do we +> still need to look this as a fix, or are we OK. I have not gotten far +> enough in the optimizer to know how to fix the + +Bruce, + +If the question is, have I come up with a solution for the cnf'ify problem: No + +If the question is, is it still important: Very much yes. + +It is essential for many RAD tools using remote data objects which make use of key +sets. Your recent optimization of the OR list goes a long way, but inevitably +users are confronted with multi-part keys. + +When I look at the problem my head spins. I do not have the experience (yet?) +with the backend to be mucking around in the optimizer. As I see it, cnf'ify is +doing just what it is supposed to do. Boundless boolean logic. + +I think hope may lay though, in identifying each AND'ed group associated with a key +and tagging it as a special sub-root node which cnf'ify does not penetrate. This +node would be allowed to pass to the later stages of the optimizer where it will be +used to plan index scans. Easy for me to say. + +In the meantime, I still have the patch that I described in prior email. It has +worked well for us. Let me restate that. We could not survive without it! +However, I do not feel that is a sufficiently functional approach that should be +incorporated as a final solution. I will submit the patch if you, (anyone) does +not come up with a better solution. It is coded to be activated by a SET KSQO to +minimize its reach. + + +From daybee@bellatlantic.net Sun Aug 30 12:06:24 1998 +Received: from iconmail.bellatlantic.net (iconmail.bellatlantic.net [199.173.162.30]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id MAA12860 + for ; Sun, 30 Aug 1998 12:06:22 -0400 (EDT) +Received: from bellatlantic.net (client196-126-73.bellatlantic.net [151.196.126.73]) + by iconmail.bellatlantic.net (IConNet Sendmail) with ESMTP id MAA18468; + Sun, 30 Aug 1998 12:03:33 -0400 (EDT) +Message-ID: <35E9726E.C6E73049@bellatlantic.net> +Date: Sun, 30 Aug 1998 11:40:31 -0400 +From: David Hartwig +Organization: Home +X-Mailer: Mozilla 4.06 [en] (Win98; I) +MIME-Version: 1.0 +To: Bruce Momjian +CC: hannu@trust.ee, pgsql-interfaces@postgreSQL.org, hackers@postgreSQL.org +Subject: Re: [INTERFACES] Re: [HACKERS] changes in 6.4 +References: <199808290344.XAA28089@candle.pha.pa.us> +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +Status: RO + + + +Bruce Momjian wrote: + +> OK, let me try this one. 
+> +> Why is the system cnf'ifying the query. Because it wants to have a +> list of qualifications that are AND'ed, so it can just pick the most +> restrictive/cheapest, and evaluate that one first. +> +> If you have: +> +> (a=b and c=d) or e=1 +> +> In this case, without cnf'ify, it has to evaluate both of them, because +> if one is false, you can't be sure another would be true. In the +> cnf'ify case, +> +> (a=b or e=1) and (c=d or e=1) +> +> In this case, it can choose either, and act on just one, if a row fails +> to meet it, it can stop and not evaluate it using the other restriction. +> +> The fact is that it is only going to use fancy join/index in one of the +> two cases, so it tries to pick the best one, and does a brute-force +> qualification test on the remaining item if the first one tried is true. +> +> The problem is of course large where clauses can exponentially expand +> this. What it really trying to do is to pick a cheapest restriction, +> but the memory explosion and query failure are serious problems. +> +> The issue is that it thinks it is doing something to help things, while +> it is actually hurting things. +> +> In the ODBC case of: +> +> (x=3 and y=4) or +> (x=3 and y=5) or +> (x=3 and y=6) or ... +> +> it clearly is not going to gain anything by choosing any CHEAPEST path, +> because they are all the same in terms of cost, and the use by ODBC +> clients is hurting reliability. +> +> I am inclined to agree with David's solution of breaking apart the query +> into separate UNION queries in certain cases. It seems to be the most +> logical solution, because the cnf'ify code is working counter to its +> purpose in these cases. +> +> Now, the question is how/where to implement this. I see your idea of +> making the OR a join to a temp table that holds all the constants. +> Another idea would be to do actual UNION queries: +> +> SELECT * FROM tab +> WHERE (x=3 and y=4) +> UNION +> SELECT * FROM tab +> WHERE (x=3 and y=5) +> UNION +> SELECT * FROM tab +> WHERE (x=3 and y=6) ... +> +> This would work well for tables with indexes, but for a sequential scan, +> you are doing a sequential scan for each UNION. + +Practically speaking, the lack of an index concern, may not be justified. The reason +these queries are being generated, with this shape, is because remote data objects on the +client side are being told that a primary key exists on these tables. The object is told +about these keys in one of two ways. + +1. It queries the database for the primary key of the table. The ODBC driver serviced +this request by querying for the attributes used in {table_name}_pkey. + +2. The user manually specifies the primary key. In this case an actual index may not +exist. (i.e. MS Access asks the user for this information if a primary key is not found +in a table) + +The second case is the only one that would cause a problem. Fortunately, the solution is +simple. Add a primary key index! + +My only concern is to be able to accurately identify a query with the proper signature +before rewriting it as a UNION. To what degree should this inspection be taken? + +BTW, I would not do the rewrite on OR's without AND's since you have fixed the OR's use +of the index. + +There is one other potential issue. My experience with using arrays in tables and UNIONS +creates problems. There are missing array comparison operators which are used by the +implied DISTINCT. + +> Another idea is +> subselects. 
Also, you have to make sure you return the proper rows, +> keeping duplicates where they are in the base table, but not returning +> them when the meet more than one qualification. +> +> SELECT * FROM tab +> WHERE (x,y) IN (SELECT 3, 4 +> UNION +> SELECT 3, 5 +> UNION +> SELECT 3, 6) +> +> I believe we actually support this. This is not going to use an index +> on tab, so it may be slow if x and y are indexed. +> +> Another more bizarre solution is: +> +> SELECT * FROM tab +> WHERE (x,y) = (SELECT 3, 4) OR +> (x,y) = (SELECT 3, 5) OR +> (x,y) = (SELECT 3, 6) +> +> Again, I think we do this too. I don't think cnf'ify does anything with +> this. I also believe "=" uses indexes on subselects, while IN does not +> because IN could return lots of rows, and an index is slower than a +> non-index join on lots of rows. Of course, now that we index OR's. +> +> Let me ask another question. If I do: +> +> SELECT * FROM tab WHERE x=3 OR x=4 +> +> it works, and uses indexes. Why can't the optimizer just not cnf'ify +> things sometimes, and just do: +> +> SELECT * FROM tab +> WHERE (x=3 AND y=4) OR +> (x=3 AND y=5) OR +> (x=3 AND y=6) +> +> Why can it handle x=3 OR x=4, but not the more complicated case above, +> without trying to be too smart? If x,y is a multi-key index, it could +> use that quite easily. If not, it can do a sequentail scan and run the +> tests. +> +> Another issue. To the optimizer, x=3 and x=y are totally different. In +> x=3, it is a column compared to a constant, while in x=y, it is a join. +> That makes a huge difference. +> +> In the case of (a=b and c=d) or e=1, you pick the best path and do the +> a=b join, and throw in the e=1 entries. You can't easily do both joins, +> because you also need the e=1 stuff. +> +> I wounder what would happen if we prevent cnf'ifying of cases where the +> OR represent only column = constant restrictions. +> +> I meant to really go through the optimizer this month, but other backend +> items took my time. +> +> Can someone run some tests on disabling the cnf'ify calls. It is my +> understanding that with the non-cnf-ify'ed query, it can't choose an +> optimial path, and starts to do either straight index matches, +> sequential scans, or cartesian products where it joins every row to +> every other row looking for a match. +> +> Let's say we turn off cnf-ify just for non-join queries. Does that +> help? +> +> I am not sure of the ramifications of telling the optimizer it no longer +> has a variety of paths to choose for evaluating the query. + +I did not try this earlier because I thought it was too good to be true. I was right. +I tried commenting out the normalize() function in the cnfify(). The EXPLAIN showed a +sequential scan and the resulting tuple set was empty. Time will not allow me to dig +into this further this weekend. + +Unless you come up with a better solution, I am going to submit my patch on Monday to +make the Sept. 1st deadline. It includes a SET switch to activate the rewrite so as not +to cause problems outside the ODBC users. We can either improve, it or yank it, by the +Oct. 1st deadline. 
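
A stand-alone sketch (not part of the original thread) of the clause-count recurrence behind the "exponential expansion" Bruce describes above: converting to CNF sums the clause counts of an AND node's children and multiplies them for an OR node's children, the same metric Taral proposes later in this file. The BoolExpr type and function names below are illustrative only, not actual backend symbols.

	#include <stdio.h>
	#include <stdlib.h>

	typedef enum BoolOp { LEAF, AND_NODE, OR_NODE } BoolOp;

	typedef struct BoolExpr
	{
		BoolOp           op;
		struct BoolExpr *left;
		struct BoolExpr *right;
	} BoolExpr;

	/* Number of AND'ed clauses a CNF rewrite of this tree would contain. */
	static long
	cnf_clause_count(const BoolExpr *e)
	{
		if (e->op == LEAF)
			return 1;
		if (e->op == AND_NODE)
			return cnf_clause_count(e->left) + cnf_clause_count(e->right);
		/* OR: every clause on the left pairs with every clause on the right. */
		return cnf_clause_count(e->left) * cnf_clause_count(e->right);
	}

	/* Build (v1=c AND v2=c) OR (v1=c AND v2=c) OR ... with ngroups groups. */
	static BoolExpr *
	make_keyset_query(int ngroups)
	{
		BoolExpr   *tree = NULL;
		int         i;

		for (i = 0; i < ngroups; i++)
		{
			BoolExpr   *group = calloc(1, sizeof(BoolExpr));

			group->op = AND_NODE;
			group->left = calloc(1, sizeof(BoolExpr));   /* LEAF: v1 = const */
			group->right = calloc(1, sizeof(BoolExpr));  /* LEAF: v2 = const */

			if (tree == NULL)
				tree = group;
			else
			{
				BoolExpr   *orNode = calloc(1, sizeof(BoolExpr));

				orNode->op = OR_NODE;
				orNode->left = tree;
				orNode->right = group;
				tree = orNode;
			}
		}
		return tree;
	}

	int
	main(void)
	{
		int         n;

		for (n = 5; n <= 20; n += 5)
			printf("%2d OR'ed two-column key groups -> %ld CNF clauses\n",
			       n, cnf_clause_count(make_keyset_query(n)));
		return 0;
	}

Twenty OR'ed two-column key groups already yield 1,048,576 AND'ed clauses, which is the memory blow-up the KSQO/UNION rewrite discussed in this thread is meant to avoid.
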
+ + +From infotecn@tin.it Mon Aug 31 03:01:51 1998 +Received: from mail.tol.it (mail.tin.it [194.243.154.49]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id DAA09740 + for ; Mon, 31 Aug 1998 03:01:48 -0400 (EDT) +Received: from Server.InfoTecna.com (a-mz6-50.tin.it [212.216.9.113]) + by mail.tol.it (8.8.4/8.8.4) with ESMTP + id JAA16451; Mon, 31 Aug 1998 09:00:35 +0200 (MET DST) +Received: from tm3.InfoTecna.com (Tm1.InfoTecna.com [192.168.1.1]) + by Server.InfoTecna.com (8.8.5/8.8.5) with SMTP id IAA18678; + Mon, 31 Aug 1998 08:53:13 +0200 +Message-Id: <3.0.5.32.19980831085312.00986cc0@MBox.InfoTecna.com> +X-Sender: denis@MBox.InfoTecna.com +X-Mailer: QUALCOMM Windows Eudora Light Version 3.0.5 (32) +Date: Mon, 31 Aug 1998 08:53:12 +0200 +To: David Hartwig , + Bruce Momjian +From: Sbragion Denis +Subject: Re: [INTERFACES] Re: [HACKERS] changes in 6.4 +Cc: hannu@trust.ee, pgsql-interfaces@postgreSQL.org, hackers@postgreSQL.org +In-Reply-To: <35E9726E.C6E73049@bellatlantic.net> +References: <199808290344.XAA28089@candle.pha.pa.us> +Mime-Version: 1.0 +Content-Type: text/plain; charset="us-ascii" +Status: RO + +Hello, + +At 11.40 30/08/98 -0400, David Hartwig wrote: +>> Why is the system cnf'ifying the query. Because it wants to have a +>> list of qualifications that are AND'ed, so it can just pick the most +>> restrictive/cheapest, and evaluate that one first. + +Just a small question about all this optimizations stuff. I'm not a +database expert but I think we are talking about a NP-complete problem. +Could'nt we convert this optimization problem into another NP one that is +known to have a good solution ? For example for the traveling salesman +problem there's an alghoritm that provide a solution that's never more than +two times the optimal one an provides results that are *really* near the +optimal one most of the times. The simplex alghoritm may be another +example. I think that this kind of alghoritm would be better than a +collection ot tricks for special cases, and this tricks could be used +anyway when special cases are detected. Furthermore I also know that exists +a free program I used in the past that provides this kind of optimizations +for chip design. I don't remember the exact name of the program but I +remember it came from Berkeley university. Of course may be I'm totally +missing the point. + +Hope it helps ! + +Bye! + + Dr. 
Sbragion Denis + InfoTecna + Tel, Fax: +39 39 2324054 + URL: http://space.tin.it/internet/dsbragio + +From andreas.zeugswetter@telecom.at Mon Aug 31 06:31:13 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id GAA14231 + for ; Mon, 31 Aug 1998 06:31:12 -0400 (EDT) +Received: from gandalf.telecom.at (gandalf.telecom.at [194.118.26.84]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id GAA21099 for ; Mon, 31 Aug 1998 06:23:41 -0400 (EDT) +Received: from zeugswettera.user.lan.at (zeugswettera.user.lan.at [10.4.123.227]) by gandalf.telecom.at (A.B.C.Delta4/8.8.8) with SMTP id MAA38132; Mon, 31 Aug 1998 12:22:07 +0200 +Received: by zeugswettera.user.lan.at with Microsoft Mail + id <01BDD4DA.C7F5B690@zeugswettera.user.lan.at>; Mon, 31 Aug 1998 12:27:55 +0200 +Message-ID: <01BDD4DA.C7F5B690@zeugswettera.user.lan.at> +From: Andreas Zeugswetter +To: "'maillist@candle.pha.pa.us'" +Cc: "hackers@postgreSQL.org" +Subject: AW: [INTERFACES] Re: [HACKERS] changes in 6.4 +Date: Mon, 31 Aug 1998 12:22:05 +0200 +Encoding: 31 TEXT +Status: RO + + +>Another idea would be to do actual UNION queries: +> +> SELECT * FROM tab +> WHERE (x=3 and y=4) +> UNION +> SELECT * FROM tab +> WHERE (x=3 and y=5) +> UNION +> SELECT * FROM tab +> WHERE (x=3 and y=6) ... +> +>This would work well for tables with indexes, but for a sequential scan, +>you are doing a sequential scan for each UNION. + +The most important Application for this syntax will be M$ Access +because it uses this syntax to display x rows from a table in a particular +sort order. In this case x and y will be the primary key and therefore have a +unique index. So I think this special case should work good. + +The strategy could be something like: +iff x, y is a unique index + do the union access path +else + do something else +done + +I think hand written SQL can always be rewritten if it is not fast enough +using this syntax. + +Andreas + + +From owner-pgsql-patches@hub.org Tue Sep 1 02:01:10 1998 +Received: from hub.org (hub.org [209.47.148.200]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id CAA28687 + for ; Tue, 1 Sep 1998 02:01:06 -0400 (EDT) +Received: from localhost (majordom@localhost) by hub.org (8.8.8/8.7.5) with SMTP id BAA02180; Tue, 1 Sep 1998 01:48:43 -0400 (EDT) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Tue, 01 Sep 1998 01:47:48 +0000 (EDT) +Received: (from majordom@localhost) by hub.org (8.8.8/8.7.5) id BAA02160 for pgsql-patches-outgoing; Tue, 1 Sep 1998 01:47:46 -0400 (EDT) +Received: from iconmail.bellatlantic.net (iconmail.bellatlantic.net [199.173.162.30]) by hub.org (8.8.8/8.7.5) with ESMTP id BAA02147 for ; Tue, 1 Sep 1998 01:47:42 -0400 (EDT) +Received: from bellatlantic.net (client196-126-3.bellatlantic.net [151.196.126.3]) + by iconmail.bellatlantic.net (IConNet Sendmail) with ESMTP id XAA27530 + for ; Mon, 31 Aug 1998 23:24:07 -0400 (EDT) +Message-ID: <35EB2B33.EBF1E9AA@bellatlantic.net> +Date: Mon, 31 Aug 1998 19:01:07 -0400 +From: David Hartwig +Organization: Insight Distribution Systems +X-Mailer: Mozilla 4.04 [en] (X11; I; Linux 2.0.29 i586) +MIME-Version: 1.0 +To: patches +Subject: [PATCHES] Interim AND/OR memory exaustion fix. +Content-Type: multipart/mixed; boundary="------------BEFD1E6DA78A2DC20B524E32" +Sender: owner-pgsql-patches@hub.org +Precedence: bulk +Status: ROr + +This is a multi-part message in MIME format. 
+--------------BEFD1E6DA78A2DC20B524E32 +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit + +I will be cleaning this up more before the Oct 1 deadline. + +--------------BEFD1E6DA78A2DC20B524E32 +Content-Type: text/plain; charset=us-ascii; name="keyset.patch" +Content-Transfer-Encoding: 7bit +Content-Disposition: inline; filename="keyset.patch" + +*** ./backend/commands/variable.c.orig Thu Jul 30 19:25:26 1998 +--- ./backend/commands/variable.c Mon Aug 31 17:23:32 1998 +*************** +*** 24,29 **** +--- 24,30 ---- + extern bool _use_geqo_; + extern int32 _use_geqo_rels_; + extern bool _use_right_sided_plans_; ++ extern bool _use_keyset_query_optimizer; + + /*-----------------------------------------------------------------------*/ + static const char * +*************** +*** 559,564 **** +--- 560,568 ---- + }, + #endif + { ++ "ksqo", parse_ksqo, show_ksqo, reset_ksqo ++ }, ++ { + NULL, NULL, NULL, NULL + } + }; +*************** +*** 611,615 **** +--- 615,663 ---- + + elog(NOTICE, "Unrecognized variable %s", name); + ++ return TRUE; ++ } ++ ++ ++ /*----------------------------------------------------------------------- ++ KSQO code will one day be unnecessary when the optimizer makes use of ++ indexes when multiple ORs are specified in the where clause. ++ See optimizer/prep/prepkeyset.c for more on this. ++ daveh@insightdist.com 6/16/98 ++ -----------------------------------------------------------------------*/ ++ bool ++ parse_ksqo(const char *value) ++ { ++ if (value == NULL) ++ { ++ reset_ksqo(); ++ return TRUE; ++ } ++ ++ if (strcasecmp(value, "on") == 0) ++ _use_keyset_query_optimizer = true; ++ else if (strcasecmp(value, "off") == 0) ++ _use_keyset_query_optimizer = false; ++ else ++ elog(ERROR, "Bad value for Key Set Query Optimizer (%s)", value); ++ ++ return TRUE; ++ } ++ ++ bool ++ show_ksqo() ++ { ++ ++ if (_use_keyset_query_optimizer) ++ elog(NOTICE, "Key Set Query Optimizer is ON"); ++ else ++ elog(NOTICE, "Key Set Query Optimizer is OFF"); ++ return TRUE; ++ } ++ ++ bool ++ reset_ksqo() ++ { ++ _use_keyset_query_optimizer = false; + return TRUE; + } +*** ./backend/optimizer/plan/planner.c.orig Sun Aug 30 04:28:02 1998 +--- ./backend/optimizer/plan/planner.c Mon Aug 31 17:23:32 1998 +*************** +*** 69,74 **** +--- 69,75 ---- + PlannerInitPlan = NULL; + PlannerPlanId = 0; + ++ transformKeySetQuery(parse); + result_plan = union_planner(parse); + + Assert(PlannerQueryLevel == 1); +*** ./backend/optimizer/prep/Makefile.orig Sun Apr 5 20:23:48 1998 +--- ./backend/optimizer/prep/Makefile Mon Aug 31 17:23:32 1998 +*************** +*** 13,19 **** + + CFLAGS += -I../.. + +! OBJS = prepqual.o preptlist.o prepunion.o + + # not ready yet: predmig.o xfunc.o + +--- 13,19 ---- + + CFLAGS += -I../.. + +! OBJS = prepqual.o preptlist.o prepunion.o prepkeyset.o + + # not ready yet: predmig.o xfunc.o + +*** ./backend/optimizer/prep/prepkeyset.c.orig Mon Aug 31 17:23:32 1998 +--- ./backend/optimizer/prep/prepkeyset.c Mon Aug 31 18:30:58 1998 +*************** +*** 0 **** +--- 1,213 ---- ++ /*------------------------------------------------------------------------- ++ * ++ * prepkeyset.c-- ++ * Special preperation for keyset queries. 
++ * ++ * Copyright (c) 1994, Regents of the University of California ++ * ++ *------------------------------------------------------------------------- ++ */ ++ #include ++ #include ++ ++ #include "postgres.h" ++ #include "nodes/pg_list.h" ++ #include "nodes/parsenodes.h" ++ #include "utils/elog.h" ++ ++ #include "nodes/nodes.h" ++ #include "nodes/execnodes.h" ++ #include "nodes/plannodes.h" ++ #include "nodes/primnodes.h" ++ #include "nodes/relation.h" ++ ++ #include "catalog/pg_type.h" ++ #include "lib/stringinfo.h" ++ #include "optimizer/planmain.h" ++ /* ++ * Node_Copy-- ++ * a macro to simplify calling of copyObject on the specified field ++ */ ++ #define Node_Copy(from, newnode, field) newnode->field = copyObject(from->field) ++ ++ /***** DEBUG stuff ++ #define TABS {int i; printf("\n"); for (i = 0; igroupClause || ++ origNode->havingQual || ++ origNode->hasAggs || ++ origNode->utilityStmt || ++ origNode->unionClause || ++ origNode->unionall || ++ origNode->hasSubLinks || ++ origNode->commandType != CMD_SELECT) ++ return; ++ ++ /* Qualify single table query */ ++ ++ /* Qualify where clause */ ++ if ( ! inspectOrNode((Expr*)origNode->qual)) { ++ return; ++ } ++ ++ /* Copy essential elements into a union node */ ++ /* ++ elog(NOTICE, "OR_EXPR=%d, OP_EXPR=%d, AND_EXPR=%d", OR_EXPR, OP_EXPR, AND_EXPR); ++ elog(NOTICE, "T_List=%d, T_Expr=%d, T_Var=%d, T_Const=%d", T_List, T_Expr, T_Var, T_Const); ++ elog(NOTICE, "opType=%d", ((Expr*)origNode->qual)->opType); ++ */ ++ while (((Expr*)origNode->qual)->opType == OR_EXPR) { ++ Query *unionNode = makeNode(Query); ++ ++ /* Pull up Expr = */ ++ unionNode->qual = lsecond(((Expr*)origNode->qual)->args); ++ ++ /* Pull up balance of tree */ ++ origNode->qual = lfirst(((Expr*)origNode->qual)->args); ++ ++ /* ++ elog(NOTICE, "origNode: opType=%d, nodeTag=%d", ((Expr*)origNode->qual)->opType, nodeTag(origNode->qual)); ++ elog(NOTICE, "unionNode: opType=%d, nodeTag=%d", ((Expr*)unionNode->qual)->opType, nodeTag(unionNode->qual)); ++ */ ++ ++ unionNode->commandType = origNode->commandType; ++ unionNode->resultRelation = origNode->resultRelation; ++ unionNode->isPortal = origNode->isPortal; ++ unionNode->isBinary = origNode->isBinary; ++ ++ if (origNode->uniqueFlag) ++ unionNode->uniqueFlag = pstrdup(origNode->uniqueFlag); ++ ++ Node_Copy(origNode, unionNode, sortClause); ++ Node_Copy(origNode, unionNode, rtable); ++ Node_Copy(origNode, unionNode, targetList); ++ ++ origNode->unionClause = lappend(origNode->unionClause, unionNode); ++ } ++ return; ++ } ++ ++ ++ ++ ++ static int ++ inspectOrNode(Expr *expr) ++ { ++ int fr = 0, sr = 0; ++ Expr *firstExpr, *secondExpr; ++ ++ if ( ! (expr && nodeTag(expr) == T_Expr && expr->opType == OR_EXPR)) ++ return 0; ++ ++ firstExpr = lfirst(expr->args); ++ secondExpr = lsecond(expr->args); ++ if (nodeTag(firstExpr) != T_Expr || nodeTag(secondExpr) != T_Expr) ++ return 0; ++ ++ if (firstExpr->opType == OR_EXPR) ++ fr = inspectOrNode(firstExpr); ++ else if (firstExpr->opType == OP_EXPR) /* Need to make sure it is last */ ++ fr = inspectOpNode(firstExpr); ++ else if (firstExpr->opType == AND_EXPR) /* Need to make sure it is last */ ++ fr = inspectAndNode(firstExpr); ++ ++ ++ if (secondExpr->opType == AND_EXPR) ++ sr = inspectAndNode(secondExpr); ++ else if (secondExpr->opType == OP_EXPR) ++ sr = inspectOpNode(secondExpr); ++ ++ return (fr && sr); ++ } ++ ++ ++ static int ++ inspectAndNode(Expr *expr) ++ { ++ int fr = 0, sr = 0; ++ Expr *firstExpr, *secondExpr; ++ ++ if ( ! 
(expr && nodeTag(expr) == T_Expr && expr->opType == AND_EXPR)) ++ return 0; ++ ++ firstExpr = lfirst(expr->args); ++ secondExpr = lsecond(expr->args); ++ if (nodeTag(firstExpr) != T_Expr || nodeTag(secondExpr) != T_Expr) ++ return 0; ++ ++ if (firstExpr->opType == AND_EXPR) ++ fr = inspectAndNode(firstExpr); ++ else if (firstExpr->opType == OP_EXPR) ++ fr = inspectOpNode(firstExpr); ++ ++ if (secondExpr->opType == OP_EXPR) ++ sr = inspectOpNode(secondExpr); ++ ++ return (fr && sr); ++ } ++ ++ ++ static int ++ /****************************************************************** ++ * Return TRUE if T_Var = T_Const, else FALSE ++ * Actually it does not test for =. Need to do this! ++ ******************************************************************/ ++ inspectOpNode(Expr *expr) ++ { ++ Expr *firstExpr, *secondExpr; ++ ++ if (nodeTag(expr) != T_Expr || expr->opType != OP_EXPR) ++ return 0; ++ ++ firstExpr = lfirst(expr->args); ++ secondExpr = lsecond(expr->args); ++ return (firstExpr && secondExpr && nodeTag(firstExpr) == T_Var && nodeTag(secondExpr) == T_Const); ++ } +*** ./include/commands/variable.h.orig Thu Jul 30 19:27:05 1998 +--- ./include/commands/variable.h Mon Aug 31 17:23:32 1998 +*************** +*** 54,58 **** +--- 54,61 ---- + extern bool show_geqo(void); + extern bool reset_geqo(void); + extern bool parse_geqo(const char *); ++ extern bool show_ksqo(void); ++ extern bool reset_ksqo(void); ++ extern bool parse_ksqo(const char *); + + #endif /* VARIABLE_H */ +*** ./include/optimizer/planmain.h.orig Mon Aug 31 18:27:03 1998 +--- ./include/optimizer/planmain.h Mon Aug 31 18:26:04 1998 +*************** +*** 67,71 **** +--- 67,72 ---- + extern List *check_having_qual_for_aggs(Node *clause, + List *subplanTargetList, List *groupClause); + extern List *check_having_qual_for_vars(Node *clause, List *targetlist_so_far); ++ extern void transformKeySetQuery(Query *origNode); + + #endif /* PLANMAIN_H */ + +--------------BEFD1E6DA78A2DC20B524E32-- + + + +From daveh@insightdist.com Thu Sep 3 12:34:48 1998 +Received: from u1.abs.net (root@u1.abs.net [207.114.0.131]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id MAA07696 + for ; Thu, 3 Sep 1998 12:34:46 -0400 (EDT) +Received: from insightdist.com (nobody@localhost) + by u1.abs.net (8.9.0/8.9.0) with UUCP id MAA23590 + for maillist@candle.pha.pa.us; Thu, 3 Sep 1998 12:17:44 -0400 (EDT) +X-Authentication-Warning: u1.abs.net: nobody set sender to insightdist.com!daveh using -f +Received: from ceodev by insightdist.com (AIX 3.2/UCB 5.64/4.03) + id AA56436; Thu, 3 Sep 1998 11:51:24 -0400 +Received: from daveh by ceodev (AIX 4.1/UCB 5.64/4.03) + id AA45986; Thu, 3 Sep 1998 11:51:24 -0400 +Message-Id: <35EEBBEF.2158F68A@insightdist.com> +Date: Thu, 03 Sep 1998 11:55:28 -0400 +From: David Hartwig +Organization: Insight Distribution Systems +X-Mailer: Mozilla 4.05 [en] (Win95; I) +Mime-Version: 1.0 +To: Bruce Momjian +Cc: David Hartwig , pgsql-patches@postgreSQL.org +Subject: Re: [PATCHES] Interim AND/OR memory exaustion fix. +References: <199809030236.WAA22888@candle.pha.pa.us> +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +Status: RO + + + +Bruce Momjian wrote: + +> > I will be cleaning this up more before the Oct 1 deadline. +> +> > *** ./backend/commands/variable.c.orig Thu Jul 30 19:25:26 1998 +> > --- ./backend/commands/variable.c Mon Aug 31 17:23:32 1998 +> +> Applied. Let's keep talking to see if we can come up with a nice +> general solution to this. +> + +Agreed. 
+ +> I have been thinking, and the trouble case is a query that uses only one +> table, and had only "column = value" statements. I believe this can be +> easily identified and reworked somehow. +> + +If you are referring to the AND'less set of OR's, I do have plans to not let +that qualify since you have gotten the index scan working with OR's. + +I also think that the qualification process should be tightened up. For +example force the number of AND's to be the same in each OR grouping. And +have at least n OR's to qualify. We just need to head off the memory +exhaustion. + +> Your subtable idea may be a good one. +> + +This sounds like a 6.5 thing. I needed to stop the bleeding for 6.4. + + +From bga@mug.org Tue Sep 8 03:39:37 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id DAA06237 + for ; Tue, 8 Sep 1998 03:39:36 -0400 (EDT) +Received: from bgalli.mug.org (bajor.mug.org [207.158.132.1]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id DAA03648 for ; Tue, 8 Sep 1998 03:38:52 -0400 (EDT) +Received: from localhost (bga@localhost) by bgalli.mug.org (8.8.7/SCO5) with SMTP id DAA02895 for ; Tue, 8 Sep 1998 03:31:26 -0400 (EDT) +Message-Id: <199809080731.DAA02895@bgalli.mug.org> +X-Authentication-Warning: bgalli.mug.org: bga@localhost didn't use HELO protocol +X-Mailer: exmh version 2.0.2 2/24/98 +From: "Billy G. Allie" +Reply-To: "Billy G. Allie" +To: Bruce Momjian +Subject: Re: [HACKERS] flock patch breaks things here +In-reply-to: Your message of "Mon, 31 Aug 1998 00:36:34 EDT." + <199808310436.AAA07618@candle.pha.pa.us> +Mime-Version: 1.0 +Content-Type: text/plain; charset=us-ascii +Date: Tue, 08 Sep 1998 03:31:26 -0400 +Sender: bga@mug.org +Status: ROr + +Bruce Momjian writes: + +> I have been thinking about this. First, we can easily use fopen(r+) to +> check to see if the file exists, and if it does read the pid and do a +> kill -0 to see if it is running. If no one else does it, I will take it +> on. + +It is better to use open with the O_CREAT and O_EXCL set. If the file does not +exist it will be created and the PID can be written to it. If the file exists +then the call will fail, at which point it can be opened with fread, and the +PID it contains can be checked to see if it still exists with kill. The open +call has the added advantage that 'The check for the existence of the file and +the creation of the file if it does not exist is atomic with respect to other +processes executing open naming the same filename in the same directory with +O_EXCL and O_CREAT set.' [from the UnixAWare 7 man page, open(2)]. + +Also, you can't just delete the file, create it and write the your PID to it +and assume that you have the lock, you need to close the file, sleep some +small amount of time and then open and read the file to see if you still have +the lock. If you like, I can take this task on. + +Oh, the postmaster must clear the PID when it exits. + +> +> Second, where to put the pid file. There is reason to put in /tmp, +> because it will get cleared in a reboot, and because it is locking the +> port number 5432. There is also reason to put it in /data because you +> can't have more than one postmaster running on a single data directory. +> +> So, we really want to lock both places. 
If this is going to make it +> easier for people to run more than one postmaster, because it will +> prevent/warn administrators when they try and put two postmasters in the +> same data dir or port, I say create the pid lock files both places, and +> give the admin a clear description of what he is doing wrong in each +> case. + +IHMO, the pid should be put in the data directory. The reasoning that it will get cleared in a reboot is not sufficent since the logic used to create the PID file will delete it if the PID it contains is not a running process. Besides, I have used systems where /tmp was not cleared out on a re-boot (for various reasons). Also, I would rather have a script that explicitly removes the PID locking file at system statup (if it exists), in which case, it doesn't matter where it resides. +-- +____ | Billy G. Allie | Domain....: Bill.Allie@mug.org +| /| | 7436 Hartwell | Compuserve: 76337,2061 +|-/-|----- | Dearborn, MI 48126| MSN.......: B_G_Allie@email.msn.com +|/ |LLIE | (313) 582-1540 | + + + +From owner-pgsql-general@hub.org Thu Oct 1 14:00:57 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id OAA12443 + for ; Thu, 1 Oct 1998 14:00:56 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id NAA07930 for ; Thu, 1 Oct 1998 13:57:47 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id NAA26913; + Thu, 1 Oct 1998 13:56:29 -0400 (EDT) + (envelope-from owner-pgsql-general@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Thu, 01 Oct 1998 13:55:56 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id NAA26856 + for pgsql-general-outgoing; Thu, 1 Oct 1998 13:55:54 -0400 (EDT) + (envelope-from owner-pgsql-general@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-general@postgreSQL.org using -f +Received: from mail.utexas.edu (wb3-a.mail.utexas.edu [128.83.126.138]) + by hub.org (8.8.8/8.8.8) with SMTP id NAA26840 + for ; Thu, 1 Oct 1998 13:55:49 -0400 (EDT) + (envelope-from taral@mail.utexas.edu) +Received: (qmail 1198 invoked by uid 0); 1 Oct 1998 17:55:40 -0000 +Received: from dial-24-13.ots.utexas.edu (HELO taral) (128.83.128.157) + by umbs-smtp-3 with SMTP; 1 Oct 1998 17:55:40 -0000 +From: "Taral" +To: +Subject: [GENERAL] CNF vs DNF +Date: Thu, 1 Oct 1998 12:55:39 -0500 +Message-ID: <000001bded64$b34b2200$3b291f0a@taral> +MIME-Version: 1.0 +Content-Type: text/plain; + charset="iso-8859-1" +Content-Transfer-Encoding: 7bit +X-Priority: 3 (Normal) +X-MSMail-Priority: Normal +X-Mailer: Microsoft Outlook 8.5, Build 4.71.2173.0 +X-MimeOLE: Produced By Microsoft MimeOLE V4.72.3155.0 +In-Reply-To: +Importance: Normal +Sender: owner-pgsql-general@postgreSQL.org +Precedence: bulk +Status: RO + +> select * from aa where (bb = 2 and ff = 3) or (bb = 4 and ff = 5); + +I've been told that the system restructures these in CNF (conjunctive normal +form)... i.e. the above query turns into: + +select * from aa where (bb = 2 or bb = 4) and (ff = 3 or bb = 4) and (bb = 2 +or ff = 5) and (ff = 3 or ff = 5); + +Much longer and much less efficient, AFAICT. Isn't it more efficient to do a +union of many queries (DNF) than an intersection of many subqueries (CNF)? +Certainly remembering the subqueries takes less memory... Also, queries +already in DNF are probably more common than queries in CNF, requiring less +rewrite. 
+ +Can someone clarify this? + +Taral + + + +From taral@mail.utexas.edu Fri Oct 2 01:35:42 1998 +Received: from mail.utexas.edu (wb1-a.mail.utexas.edu [128.83.126.134]) + by candle.pha.pa.us (8.9.0/8.9.0) with SMTP id BAA28231 + for ; Fri, 2 Oct 1998 01:35:27 -0400 (EDT) +Received: (qmail 16318 invoked by uid 0); 2 Oct 1998 05:35:13 -0000 +Received: from dial-42-8.ots.utexas.edu (HELO taral) (128.83.111.216) + by umbs-smtp-1 with SMTP; 2 Oct 1998 05:35:13 -0000 +From: "Taral" +To: "Bruce Momjian" +Cc: +Subject: RE: [GENERAL] Long update query ? (also Re: [GENERAL] CNF vs. DNF) +Date: Fri, 2 Oct 1998 00:35:12 -0500 +Message-ID: <000001bdedc6$6cf75d20$3b291f0a@taral> +MIME-Version: 1.0 +Content-Type: text/plain; + charset="iso-8859-1" +Content-Transfer-Encoding: 7bit +X-Priority: 3 (Normal) +X-MSMail-Priority: Normal +X-Mailer: Microsoft Outlook 8.5, Build 4.71.2173.0 +In-Reply-To: <199810020218.WAA23299@candle.pha.pa.us> +Importance: Normal +X-MimeOLE: Produced By Microsoft MimeOLE V4.72.3155.0 +Status: ROr + +> It currently convert to CNF so it can select the most restrictive +> restriction and join, and use those first. However, the CNF conversion +> is a memory exploder for some queries, and we certainly need to have +> another method to split up those queries into UNIONS. I think we need +> to code to identify those queries capable of being converted to UNIONS, +> and do that before the query gets to the CNF section. That would be +> great, and David Hartwig has implemented a limited capability of doing +> this, but we really need a general routine to do this with 100% +> reliability. + +Well, if you're talking about a routine to generate a heuristic for CNF vs. +DNF, it is possible to precalculate the query sizes for CNF and DNF +rewrites... + +For conversion to CNF: + +At every node: + +if nodeType = AND then f(node) = f(left) + f(right) +if nodeType = OR then f(node) = f(left) * f(right) + +f(root) = a reasonably (but not wonderful) metric + +For DNF just switch AND and OR in the above. You may want to compute both +metrics and compare... take the smaller one and use that path. + +How to deal with other operators depends on their implementation... + +Taral + + +From taral@mail.utexas.edu Fri Oct 2 12:48:27 1998 +Received: from mail.utexas.edu (wb4-a.mail.utexas.edu [128.83.126.140]) + by candle.pha.pa.us (8.9.0/8.9.0) with SMTP id MAA11438 + for ; Fri, 2 Oct 1998 12:48:25 -0400 (EDT) +Received: (qmail 15628 invoked by uid 0); 2 Oct 1998 16:47:50 -0000 +Received: from dial-42-8.ots.utexas.edu (HELO taral) (128.83.111.216) + by umbs-smtp-4 with SMTP; 2 Oct 1998 16:47:50 -0000 +From: "Taral" +To: "Bruce Momjian" +Cc: +Subject: RE: [GENERAL] Long update query ? (also Re: [GENERAL] CNF vs. DNF) +Date: Fri, 2 Oct 1998 11:47:48 -0500 +Message-ID: <000301bdee24$63308740$3b291f0a@taral> +MIME-Version: 1.0 +Content-Type: text/plain; + charset="iso-8859-1" +Content-Transfer-Encoding: 7bit +X-Priority: 3 (Normal) +X-MSMail-Priority: Normal +X-Mailer: Microsoft Outlook 8.5, Build 4.71.2173.0 +In-reply-to: <199810021640.MAA10925@candle.pha.pa.us> +Importance: Normal +X-MimeOLE: Produced By Microsoft MimeOLE V4.72.3155.0 +Status: RO + +> > Create a temporary oid hash? (for each table selected on, I guess) +> +> What I did with indexes was to run the previous OR clause index +> restrictions through the qualification code, and make sure it failed, +> but I am not sure how that is going to work with a more complex WHERE +> clause. 
Perhaps I need to restrict this to just simple cases of +> constants, which are easy to pick out an run through. Doing this with +> joins would be very hard, I think. + +Actually, I was thinking more of an index of returned rows... After each +subquery, the backend would check each row to see if it was already in the +index... Simple duplicate check, in other words. Of course, I don't know how +well this would behave with large tables being returned... + +Anyone else have some ideas they want to throw in? + +Taral + + +From taral@mail.utexas.edu Fri Oct 2 17:13:01 1998 +Received: from mail.utexas.edu (wb1-a.mail.utexas.edu [128.83.126.134]) + by candle.pha.pa.us (8.9.0/8.9.0) with SMTP id RAA20838 + for ; Fri, 2 Oct 1998 17:12:27 -0400 (EDT) +Received: (qmail 17418 invoked by uid 0); 2 Oct 1998 21:12:19 -0000 +Received: from dial-46-30.ots.utexas.edu (HELO taral) (128.83.112.158) + by umbs-smtp-1 with SMTP; 2 Oct 1998 21:12:19 -0000 +From: "Taral" +To: "Bruce Momjian" , +Cc: +Subject: RE: [HACKERS] RE: [GENERAL] Long update query ? (also Re: [GENERAL] CNF vs. DNF) +Date: Fri, 2 Oct 1998 16:12:19 -0500 +Message-ID: <000001bdee49$56c7cd40$3b291f0a@taral> +MIME-Version: 1.0 +Content-Type: text/plain; + charset="iso-8859-1" +Content-Transfer-Encoding: 7bit +X-Priority: 3 (Normal) +X-MSMail-Priority: Normal +X-Mailer: Microsoft Outlook 8.5, Build 4.71.2173.0 +In-reply-to: <199810021758.NAA15524@candle.pha.pa.us> +Importance: Normal +X-MimeOLE: Produced By Microsoft MimeOLE V4.72.3155.0 +Status: ROr + +> Another idea is that we rewrite queries such as: +> +> SELECT * +> FROM tab +> WHERE (a=1 AND b=2 AND c=3) OR +> (a=1 AND b=2 AND c=4) OR +> (a=1 AND b=2 AND c=5) OR +> (a=1 AND b=2 AND c=6) +> +> into: +> +> SELECT * +> FROM tab +> WHERE (a=1 AND b=2) AND (c=3 OR c=4 OR c=5 OR c=6) + +Very nice, but that's like trying to code factorization of numbers... not +pretty, and very CPU intensive on complex queries... + +Taral + + +From taral@mail.utexas.edu Fri Oct 2 17:49:59 1998 +Received: from mail.utexas.edu (wb2-a.mail.utexas.edu [128.83.126.136]) + by candle.pha.pa.us (8.9.0/8.9.0) with SMTP id RAA21488 + for ; Fri, 2 Oct 1998 17:49:52 -0400 (EDT) +Received: (qmail 23729 invoked by uid 0); 2 Oct 1998 21:49:27 -0000 +Received: from dial-2-6.ots.utexas.edu (HELO taral) (128.83.204.22) + by umbs-smtp-2 with SMTP; 2 Oct 1998 21:49:27 -0000 +From: "Taral" +To: "Bruce Momjian" +Cc: , +Subject: RE: [HACKERS] RE: [GENERAL] Long update query ? (also Re: [GENERAL] CNF vs. DNF) +Date: Fri, 2 Oct 1998 16:49:26 -0500 +Message-ID: <000001bdee4e$86688b20$3b291f0a@taral> +MIME-Version: 1.0 +Content-Type: text/plain; + charset="iso-8859-1" +Content-Transfer-Encoding: 7bit +X-Priority: 3 (Normal) +X-MSMail-Priority: Normal +X-Mailer: Microsoft Outlook 8.5, Build 4.71.2173.0 +In-Reply-To: <199810022139.RAA21082@candle.pha.pa.us> +Importance: Normal +X-MimeOLE: Produced By Microsoft MimeOLE V4.72.3155.0 +Status: ROr + +> > Very nice, but that's like trying to code factorization of +> numbers... not +> > pretty, and very CPU intensive on complex queries... +> +> Yes, but how large are the WHERE clauses going to be? Considering the +> cost of cnfify() and UNION, it seems like a clear win. Is it general +> enough to solve our problems? + +Could be... the examples I received where the cnfify() was really bad were +cases where the query was submitted alredy in DNF... and where the UNION was +a simple one. However, I don't know of any algorithms for generic +simplification of logical constraints. 
One problem is resolution/selection +of factors: + +SELECT * FROM a WHERE (a = 1 AND b = 2 AND c = 3) OR (a = 4 AND b = 2 AND c += 3) OR (a = 1 AND b = 5 AND c = 3) OR (a = 1 AND b = 2 AND c = 6); + +Try that on for size. You can understand why that code gets ugly, fast. +Somebody could try coding it, but it's not a clear win to me. + +My original heuristic was missing one thing: "Where the heuristic fails to +process or decide, default to CNF." Since that's the current behavior, we're +less likely to break things. + +Taral + + +From owner-pgsql-hackers@hub.org Fri Oct 2 19:28:09 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id TAA23341 + for ; Fri, 2 Oct 1998 19:28:08 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id SAA18003 for ; Fri, 2 Oct 1998 18:21:37 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id SAA01250; + Fri, 2 Oct 1998 18:08:02 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Fri, 02 Oct 1998 18:04:37 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id SAA00847 + for pgsql-hackers-outgoing; Fri, 2 Oct 1998 18:04:35 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from mail.utexas.edu (wb2-a.mail.utexas.edu [128.83.126.136]) + by hub.org (8.8.8/8.8.8) with SMTP id SAA00806 + for ; Fri, 2 Oct 1998 18:04:26 -0400 (EDT) + (envelope-from taral@mail.utexas.edu) +Received: (qmail 29662 invoked by uid 0); 2 Oct 1998 22:04:25 -0000 +Received: from dial-2-6.ots.utexas.edu (HELO taral) (128.83.204.22) + by umbs-smtp-2 with SMTP; 2 Oct 1998 22:04:25 -0000 +From: "Taral" +To: "Bruce Momjian" +Cc: , +Subject: RE: [HACKERS] RE: [GENERAL] Long update query ? (also Re: [GENERAL] CNF vs. DNF) +Date: Fri, 2 Oct 1998 17:04:24 -0500 +Message-ID: <000201bdee50$9d9c4320$3b291f0a@taral> +MIME-Version: 1.0 +Content-Type: text/plain; + charset="iso-8859-1" +Content-Transfer-Encoding: 7bit +X-Priority: 3 (Normal) +X-MSMail-Priority: Normal +X-Mailer: Microsoft Outlook 8.5, Build 4.71.2173.0 +In-Reply-To: <199810022157.RAA21769@candle.pha.pa.us> +Importance: Normal +X-MimeOLE: Produced By Microsoft MimeOLE V4.72.3155.0 +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +> How do we do that with UNION, and return the right rows. Seems the +> _join_ happending multiple times would be much worse than the factoring. + +Ok... We have two problems: + +1) DNF for unjoined queries. +2) Factorization for the rest. + +I have some solutions for (1). Not for (2). Remember that unjoined queries +are quite common. :) + +For (1), we can always try to parallel the multiple queries... especially in +the case where a sequential search is required. 
+ +Taral + + + +From owner-pgsql-hackers@hub.org Sat Oct 3 23:32:35 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id XAA06644 + for ; Sat, 3 Oct 1998 23:31:13 -0400 (EDT) +Received: from hub.org (root@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id XAA26912 for ; Sat, 3 Oct 1998 23:14:01 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id WAA04407; + Sat, 3 Oct 1998 22:07:05 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Sat, 03 Oct 1998 22:02:00 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id WAA04010 + for pgsql-hackers-outgoing; Sat, 3 Oct 1998 22:01:59 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from candle.pha.pa.us (maillist@s5-03.ppp.op.net [209.152.195.67]) + by hub.org (8.8.8/8.8.8) with ESMTP id WAA03968 + for ; Sat, 3 Oct 1998 22:00:37 -0400 (EDT) + (envelope-from maillist@candle.pha.pa.us) +Received: (from maillist@localhost) + by candle.pha.pa.us (8.9.0/8.9.0) id VAA04640; + Sat, 3 Oct 1998 21:57:30 -0400 (EDT) +From: Bruce Momjian +Message-Id: <199810040157.VAA04640@candle.pha.pa.us> +Subject: Re: [HACKERS] RE: [GENERAL] Long update query ? (also Re: [GENERAL] CNF vs. DNF) +In-Reply-To: <000201bdee50$9d9c4320$3b291f0a@taral> from Taral at "Oct 2, 1998 5: 4:24 pm" +To: taral@mail.utexas.edu (Taral) +Date: Sat, 3 Oct 1998 21:57:30 -0400 (EDT) +Cc: jwieck@debis.com, hackers@postgreSQL.org +X-Mailer: ELM [version 2.4ME+ PL47 (25)] +MIME-Version: 1.0 +Content-Type: text/plain; charset=US-ASCII +Content-Transfer-Encoding: 7bit +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + + +I have another idea. + +When we cnfify, this: + + (A AND B) OR (C AND D) + +becomes + + (A OR C) AND (A OR D) AND (B OR C) AND (B OR D) + +however if A and C are identical, this could become: + + (A OR A) AND (A OR D) AND (B OR A) AND (B OR D) + +and A OR A is A: + + A AND (A OR D) AND (B OR A) AND (B OR D) + +and since we are now saying A has to be true, we can remove OR's with A: + + A AND (B OR D) + +Much smaller, and a big win for queries like: + + SELECT * + FROM tab + WHERE (a=1 AND b=2) OR + (a=1 AND b=3) + +This becomes: + + (a=1) AND (b=2 OR b=3) + +which is accurate, and uses our OR indexing. + +Seems I could code cnfify() to look for identical qualifications in two +joined OR clauses and remove the duplicates. + +Sound like big win, and fairly easy and inexpensive in processing time. + +Comments? + +-- + Bruce Momjian | http://www.op.net/~candle + maillist@candle.pha.pa.us | (610) 853-3000 + + If your life is a hard drive, | 830 Blythe Avenue + + Christ can be your backup. 
| Drexel Hill, Pennsylvania 19026 + + + +From taral@mail.utexas.edu Sat Oct 3 22:43:41 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id WAA05961 + for ; Sat, 3 Oct 1998 22:42:18 -0400 (EDT) +Received: from mail.utexas.edu (wb2-a.mail.utexas.edu [128.83.126.136]) by renoir.op.net (o1/$Revision: 1.1 $) with SMTP id WAA25111 for ; Sat, 3 Oct 1998 22:27:34 -0400 (EDT) +Received: (qmail 25622 invoked by uid 0); 4 Oct 1998 02:26:21 -0000 +Received: from dial-42-9.ots.utexas.edu (HELO taral) (128.83.111.217) + by umbs-smtp-2 with SMTP; 4 Oct 1998 02:26:21 -0000 +From: "Taral" +To: "Bruce Momjian" +Cc: , +Subject: RE: [HACKERS] RE: [GENERAL] Long update query ? (also Re: [GENERAL] CNF vs. DNF) +Date: Sat, 3 Oct 1998 21:26:20 -0500 +Message-ID: <000501bdef3e$5f5293a0$3b291f0a@taral> +MIME-Version: 1.0 +Content-Type: text/plain; + charset="iso-8859-1" +Content-Transfer-Encoding: 7bit +X-Priority: 3 (Normal) +X-MSMail-Priority: Normal +X-Mailer: Microsoft Outlook 8.5, Build 4.71.2173.0 +Importance: Normal +In-Reply-To: <199810040157.VAA04640@candle.pha.pa.us> +X-MimeOLE: Produced By Microsoft MimeOLE V4.72.3155.0 +Status: ROr + +> however if A and C are identical, this could become: +> +> (A OR A) AND (A OR D) AND (B OR A) AND (B OR D) +> +> and A OR A is A: +> +> A AND (A OR D) AND (B OR A) AND (B OR D) +> +> and since we are now saying A has to be true, we can remove OR's with A: +> +> A AND (B OR D) + +Very nice... and you could do that after each iteration of the rewrite, +preventing the size from getting too big. :) + +I have a symbolic expression tree evaluator that would be perfect for +this... I'll see if I can't adapt it. + +Can someone mail me the structures for expression trees? I don't want to +have to excise them from the source. Please? + +Taral + + +From daveh@insightdist.com Mon Nov 9 13:31:07 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id NAA00997 + for ; Mon, 9 Nov 1998 13:31:00 -0500 (EST) +Received: from u1.abs.net (root@u1.abs.net [207.114.0.131]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id NAA26657 for ; Mon, 9 Nov 1998 13:10:14 -0500 (EST) +Received: from insightdist.com (nobody@localhost) + by u1.abs.net (8.9.0/8.9.0) with UUCP id MAA17710 + for maillist@candle.pha.pa.us; Mon, 9 Nov 1998 12:52:05 -0500 (EST) +X-Authentication-Warning: u1.abs.net: nobody set sender to insightdist.com!daveh using -f +Received: from ceodev by insightdist.com (AIX 3.2/UCB 5.64/4.03) + id AA43498; Mon, 9 Nov 1998 12:38:24 -0500 +Received: from daveh by ceodev (AIX 4.1/UCB 5.64/4.03) + id AA54446; Mon, 9 Nov 1998 12:38:23 -0500 +Message-Id: <3647296F.6F7FDDD2@insightdist.com> +Date: Mon, 09 Nov 1998 12:42:07 -0500 +From: David Hartwig +Organization: Insight Distribution Systems +X-Mailer: Mozilla 4.5 [en] (Win98; I) +X-Accept-Language: en +Mime-Version: 1.0 +To: Bob Kruger , + Bruce Momjian +Cc: pgsql-general@postgreSQL.org, Byron Nikolaidis +Subject: Re: [GENERAL] Incrementing a Serial Field +References: <3.0.5.32.19981109110757.0082c950@mindspring.com> +Content-Type: multipart/mixed; + boundary="------------3D3EE7F67DFC542D3928BB7E" +Status: ROr + +This is a multi-part message in MIME format. 
+--------------3D3EE7F67DFC542D3928BB7E +Content-Type: multipart/alternative; + boundary="------------43E2CC34278FA08EFC9E0611" + + +--------------43E2CC34278FA08EFC9E0611 +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit + + + +Bob Kruger wrote: + +> The second question is that I noticed the ODBC bug (feature?) when linking +> Postgres to MS Access still exists. This bug occurs when linking a MS +> Access table to a Postgres table, and identifying more than one field as +> the unique record identifier. This makes Postgres run until it exhausts +> all available memory. Does anyone know a way around this? Enabling read +> only ODBC is a feature I would like to make available, but I do not want +> the possibility of postgres crashing because of an error on the part of a +> MS Access user. +> +> BTW - Having capability to be linked to an Access database is not an +> option. The current project I am working on calls for that, so it is a +> necessary evil that I hav to live with. +> + +In the driver connection settings add the following line. + + SET ksql TO 'on'; + +Stands for: keyset query optimization. This is not considered a final +solution. As such, it is undocumented. Some time in the next day or so, we +will be releasing a version of the driver which will automatically SET ksqo. + +You will most likely be satisfied with the results. One problem with this +solution, however, is that it does not work if you have any (some kinds of?) +arrays in the table you are browsing. This is a sideffect of the rewrite to a +UNION which performs an internal sort unique. + +Also, if you are using row versioning you may need to overload some operators +for xid and int4. I have included a script that will take care of this. + +Bruce, can I get these operators hardcoded into 6.4.1- assuming there will be +one. The operators necessitated by the UNION sideffects. + + +--------------43E2CC34278FA08EFC9E0611 +Content-Type: text/html; charset=us-ascii +Content-Transfer-Encoding: 7bit + + + +  +
  + +--------------43E2CC34278FA08EFC9E0611-- + +--------------3D3EE7F67DFC542D3928BB7E +Content-Type: text/plain; charset=us-ascii; + name="xidint4.sql" +Content-Transfer-Encoding: 7bit +Content-Disposition: inline; + filename="xidint4.sql" + +-- Insight Distribution Systems - System V - Apr 1998 +-- @(#)xidint4.sql 1.2 :/sccs/sql/extend/s.xidint4.sql 10/2/98 13:40:19" + +create function int4eq(xid,int4) + returns bool + as '' + language 'internal'; + +create operator = ( + leftarg=xid, + rightarg=int4, + procedure=int4eq, + commutator='=', + negator='<>', + restrict=eqsel, + join=eqjoinsel + ); + +create function int4lt(xid,xid) + returns bool + as '' + language 'internal'; + +create operator < ( + leftarg=xid, + rightarg=xid, + procedure=int4lt, + commutator='=', + negator='<>', + restrict=eqsel, + join=eqjoinsel + ); + + + +--------------3D3EE7F67DFC542D3928BB7E-- + + diff --git a/doc/TODO.detail/flock b/doc/TODO.detail/flock new file mode 100644 index 0000000000..f25270ac68 --- /dev/null +++ b/doc/TODO.detail/flock @@ -0,0 +1,351 @@ +From tgl@sss.pgh.pa.us Sun Aug 30 11:25:23 1998 +Received: from sss.sss.pgh.pa.us (sss.pgh.pa.us [206.210.65.6]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id LAA12607 + for ; Sun, 30 Aug 1998 11:25:20 -0400 (EDT) +Received: from sss.sss.pgh.pa.us (localhost [127.0.0.1]) + by sss.sss.pgh.pa.us (8.9.1/8.9.1) with ESMTP id LAA15788; + Sun, 30 Aug 1998 11:23:38 -0400 (EDT) +To: Bruce Momjian +cc: dz@cs.unitn.it (Massimo Dal Zotto), hackers@postgreSQL.org +Subject: Re: [HACKERS] flock patch breaks things here +In-reply-to: Your message of Sun, 30 Aug 1998 08:19:52 -0400 (EDT) + <199808301219.IAA08821@candle.pha.pa.us> +Date: Sun, 30 Aug 1998 11:23:38 -0400 +Message-ID: <15786.904490618@sss.pgh.pa.us> +From: Tom Lane +Status: RO + +Bruce Momjian writes: +> Can't we just have configure check for flock(). Another idea is to +> create a 'pid' file in the pgsql/data/base directory, and do a kill -0 +> to see if it is stil running before removing the lock. + +The latter approach is what I was going to suggest. Writing a pid file +would be a fine idea anyway --- for one thing, it makes it a lot easier +to write a "kill the postmaster" script. Given that the postmaster +should write a pid file, a new postmaster should look for an existing +pid file, and try to do a kill(pid, 0) on the number contained therein. +If this doesn't return an error, then you figure there is already a +postmaster running, complain, and exit. Otherwise you figure you is it, +(re)write the pid file and away you go. Then pqcomm.c can just +unconditionally delete any old file that's in the way of making the +pipe. + +The pidfile checking and creation probably ought to go in postmaster.c, +not down inside pqcomm.c. I never liked the fact that a critical +interlock function was being done by a low-level library that one might +not even want to invoke (if all your clients are using TCP, opening up +the Unix-domain socket is a waste of time, no?). + +BTW, there is another problem with relying on flock on the socket file +for this purpose: it opens up a hole for a denial-of-service attack. +Anyone who can write the file can flock it. (We already had a problem +with DOS via creating a dummy file at /tmp/.s.PGSQL.5432, but it would +be harder to spot the culprit with an flock-based interference.) 
+ + regards, tom lane + +From owner-pgsql-hackers@hub.org Sun Aug 30 12:27:41 1998 +Received: from hub.org (hub.org [209.47.148.200]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id MAA12976 + for ; Sun, 30 Aug 1998 12:27:37 -0400 (EDT) +Received: from localhost (majordom@localhost) by hub.org (8.8.8/8.7.5) with SMTP id MAA09234; Sun, 30 Aug 1998 12:24:51 -0400 (EDT) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Sun, 30 Aug 1998 12:23:26 +0000 (EDT) +Received: (from majordom@localhost) by hub.org (8.8.8/8.7.5) id MAA09167 for pgsql-hackers-outgoing; Sun, 30 Aug 1998 12:23:25 -0400 (EDT) +Received: from mambo.cs.unitn.it (mambo.cs.unitn.it [193.205.199.204]) by hub.org (8.8.8/8.7.5) with SMTP id MAA09150 for ; Sun, 30 Aug 1998 12:23:08 -0400 (EDT) +Received: from boogie.cs.unitn.it (dz@boogie [193.205.199.79]) by mambo.cs.unitn.it (8.6.12/8.6.12) with ESMTP id SAA29572; Sun, 30 Aug 1998 18:21:42 +0200 +Received: (from dz@localhost) by boogie.cs.unitn.it (8.8.5/8.6.9) id SAA05993; Sun, 30 Aug 1998 18:21:41 +0200 +From: Massimo Dal Zotto +Message-Id: <199808301621.SAA05993@boogie.cs.unitn.it> +Subject: Re: [HACKERS] flock patch breaks things here +To: hackers@postgreSQL.org (PostgreSQL Hackers) +Date: Sun, 30 Aug 1998 18:21:41 +0200 (MET DST) +Cc: tgl@sss.pgh.pa.us (Tom Lane) +In-Reply-To: <15786.904490618@sss.pgh.pa.us> from "Tom Lane" at Aug 30, 98 11:23:38 am +X-Mailer: ELM [version 2.4 PL24 ME4] +MIME-Version: 1.0 +Content-Type: text/plain; charset=iso-8859-1 +Content-Transfer-Encoding: 8bit +Sender: owner-pgsql-hackers@hub.org +Precedence: bulk +Status: ROr + +> +> Bruce Momjian writes: +> > Can't we just have configure check for flock(). Another idea is to +> > create a 'pid' file in the pgsql/data/base directory, and do a kill -0 +> > to see if it is stil running before removing the lock. +> +> The latter approach is what I was going to suggest. Writing a pid file +> would be a fine idea anyway --- for one thing, it makes it a lot easier +> to write a "kill the postmaster" script. Given that the postmaster +> should write a pid file, a new postmaster should look for an existing +> pid file, and try to do a kill(pid, 0) on the number contained therein. +> If this doesn't return an error, then you figure there is already a +> postmaster running, complain, and exit. Otherwise you figure you is it, +> (re)write the pid file and away you go. Then pqcomm.c can just +> unconditionally delete any old file that's in the way of making the +> pipe. +> +> The pidfile checking and creation probably ought to go in postmaster.c, +> not down inside pqcomm.c. I never liked the fact that a critical +> interlock function was being done by a low-level library that one might +> not even want to invoke (if all your clients are using TCP, opening up +> the Unix-domain socket is a waste of time, no?). +> +> BTW, there is another problem with relying on flock on the socket file +> for this purpose: it opens up a hole for a denial-of-service attack. +> Anyone who can write the file can flock it. (We already had a problem +> with DOS via creating a dummy file at /tmp/.s.PGSQL.5432, but it would +> be harder to spot the culprit with an flock-based interference.) + +This came to my mind, but I didn't think this would have happened so +quickly. In my opinion the socket and the pidfile should be created in a +directory owned by postgres, for example /tmp/.Pgsql-unix, like does X. 
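+
+(A minimal sketch of the pidfile-plus-kill(pid, 0) interlock described
+above; the file name, error handling and layout here are illustrative
+only, not the eventual postmaster.c code:
+
+    #include <stdio.h>
+    #include <errno.h>
+    #include <signal.h>
+    #include <unistd.h>
+    #include <sys/types.h>
+
+    /* Return 0 if we now hold the pid file, -1 otherwise. */
+    int
+    acquire_pidfile(const char *path)
+    {
+        FILE *fp = fopen(path, "r");
+
+        if (fp != NULL)
+        {
+            long oldpid = 0;
+
+            if (fscanf(fp, "%ld", &oldpid) == 1 && oldpid > 0)
+            {
+                /* kill(pid, 0) delivers no signal; it only probes existence */
+                if (kill((pid_t) oldpid, 0) == 0 || errno == EPERM)
+                {
+                    fclose(fp);
+                    return -1;          /* old postmaster still running */
+                }
+            }
+            fclose(fp);                 /* stale pid file: overwrite it below */
+        }
+
+        fp = fopen(path, "w");
+        if (fp == NULL)
+            return -1;
+        fprintf(fp, "%ld\n", (long) getpid());
+        fclose(fp);
+        return 0;
+    }
+
+    int
+    main(void)
+    {
+        return acquire_pidfile("/usr/local/pgsql/data/postmaster.pid") == 0 ? 0 : 1;
+    }
+
+A new postmaster would run this against a pid file kept at the top of its
+database directory before touching the socket file.)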
+ +-- +Massimo Dal Zotto + ++----------------------------------------------------------------------+ +| Massimo Dal Zotto email: dz@cs.unitn.it | +| Via Marconi, 141 phone: ++39-461-534251 | +| 38057 Pergine Valsugana (TN) www: http://www.cs.unitn.it/~dz/ | +| Italy pgp: finger dz@tango.cs.unitn.it | ++----------------------------------------------------------------------+ + + +From owner-pgsql-hackers@hub.org Sun Aug 30 13:01:10 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id NAA13785 + for ; Sun, 30 Aug 1998 13:01:09 -0400 (EDT) +Received: from hub.org (hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id MAA29386 for ; Sun, 30 Aug 1998 12:58:24 -0400 (EDT) +Received: from localhost (majordom@localhost) by hub.org (8.8.8/8.7.5) with SMTP id MAA11406; Sun, 30 Aug 1998 12:54:48 -0400 (EDT) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Sun, 30 Aug 1998 12:52:22 +0000 (EDT) +Received: (from majordom@localhost) by hub.org (8.8.8/8.7.5) id MAA11310 for pgsql-hackers-outgoing; Sun, 30 Aug 1998 12:52:20 -0400 (EDT) +Received: from sss.sss.pgh.pa.us (sss.pgh.pa.us [206.210.65.6]) by hub.org (8.8.8/8.7.5) with ESMTP id MAA11296 for ; Sun, 30 Aug 1998 12:52:13 -0400 (EDT) +Received: from sss.sss.pgh.pa.us (localhost [127.0.0.1]) + by sss.sss.pgh.pa.us (8.9.1/8.9.1) with ESMTP id MAA16094; + Sun, 30 Aug 1998 12:50:55 -0400 (EDT) +To: Massimo Dal Zotto +cc: hackers@postgreSQL.org (PostgreSQL Hackers) +Subject: Re: [HACKERS] flock patch breaks things here +In-reply-to: Your message of Sun, 30 Aug 1998 18:21:41 +0200 (MET DST) + <199808301621.SAA05993@boogie.cs.unitn.it> +Date: Sun, 30 Aug 1998 12:50:55 -0400 +Message-ID: <16092.904495855@sss.pgh.pa.us> +From: Tom Lane +Sender: owner-pgsql-hackers@hub.org +Precedence: bulk +Status: RO + +Massimo Dal Zotto writes: +> In my opinion the socket and the pidfile should be created in a +> directory owned by postgres, for example /tmp/.Pgsql-unix, like does X. + +The pidfile belongs at the top level of the database directory (eg, +/usr/local/pgsql/data/postmaster.pid), because what it actually +represents is that there is a postmaster running *for that database +group*. + +If you want to support multiple database sets on one machine (which I +do), then the interlock has to be per database directory. Putting the +pidfile into a common directory would mean we'd have to invent some +kind of pidfile naming convention to keep multiple postmasters from +tromping on each other. This is unnecessarily complex. + +I agree with you that putting the socket file into a less easily munged +directory than /tmp would be a good idea for security. But that's a +separate issue. On machines that understand stickybits for directories, +the security hole is not really very big. + +At this point, the fact that /tmp/.s.PGSQL.port# is the socket path is +effectively a version-independent aspect of the FE/BE protocol, and so +we can't change it without breaking old applications. I'm not sure that +that's worth the security improvement. + +What I'd like to see someday is a postmaster command line switch to tell +it to use *only* TCP connections and not create a Unix socket at all. +That hasn't been possible so far, because we were relying on the socket +file to provide a safety interlock against starting multiple +postmasters. But an interlock using a pidfile would be much better. 
+(Look around; *every* other Unix daemon I know of that wants to ensure +that there's only one of it uses a pidfile interlock. Not file locking. +There's a reason why that's the well-trodden path.) + + regards, tom lane + + +From owner-pgsql-hackers@hub.org Sun Aug 30 15:31:13 1998 +Received: from hub.org (hub.org [209.47.148.200]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id PAA15275 + for ; Sun, 30 Aug 1998 15:31:11 -0400 (EDT) +Received: from localhost (majordom@localhost) by hub.org (8.8.8/8.7.5) with SMTP id PAA22194; Sun, 30 Aug 1998 15:27:20 -0400 (EDT) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Sun, 30 Aug 1998 15:23:58 +0000 (EDT) +Received: (from majordom@localhost) by hub.org (8.8.8/8.7.5) id PAA21800 for pgsql-hackers-outgoing; Sun, 30 Aug 1998 15:23:57 -0400 (EDT) +Received: from thelab.hub.org (nat0118.mpoweredpc.net [142.177.188.118]) by hub.org (8.8.8/8.7.5) with ESMTP id PAA21696 for ; Sun, 30 Aug 1998 15:22:51 -0400 (EDT) +Received: from localhost (scrappy@localhost) + by thelab.hub.org (8.9.1/8.8.8) with SMTP id QAA18542; + Sun, 30 Aug 1998 16:21:29 -0300 (ADT) + (envelope-from scrappy@hub.org) +X-Authentication-Warning: thelab.hub.org: scrappy owned process doing -bs +Date: Sun, 30 Aug 1998 16:21:28 -0300 (ADT) +From: The Hermit Hacker +To: Tom Lane +cc: Massimo Dal Zotto , + PostgreSQL Hackers +Subject: Re: [HACKERS] flock patch breaks things here +In-Reply-To: <16092.904495855@sss.pgh.pa.us> +Message-ID: +MIME-Version: 1.0 +Content-Type: TEXT/PLAIN; charset=US-ASCII +Sender: owner-pgsql-hackers@hub.org +Precedence: bulk +Status: RO + +On Sun, 30 Aug 1998, Tom Lane wrote: + +> Massimo Dal Zotto writes: +> > In my opinion the socket and the pidfile should be created in a +> > directory owned by postgres, for example /tmp/.Pgsql-unix, like does X. +> +> The pidfile belongs at the top level of the database directory (eg, +> /usr/local/pgsql/data/postmaster.pid), because what it actually +> represents is that there is a postmaster running *for that database +> group*. + + I have to agree with this one...but then it also negates the +argument about the flock() DoS...*grin* + + BTW...I like the kill(pid,0) solution myself, primarily because it +is, i think, the most portable solution. + + I would not consider a patch to remove the flock() solution and +replace it with the kill(pid,0) solution a new feature, just an +improvement of an existing one...either way, moving the pid file (or +socket, for that matter) from /tmp should be listed as a security related +requirement for v6.4 :) + +Marc G. 
Fournier +Systems Administrator @ hub.org +primary: scrappy@hub.org secondary: scrappy@{freebsd|postgresql}.org + + + +From owner-pgsql-hackers@hub.org Sun Aug 30 22:41:10 1998 +Received: from hub.org (hub.org [209.47.148.200]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id WAA01526 + for ; Sun, 30 Aug 1998 22:41:08 -0400 (EDT) +Received: from localhost (majordom@localhost) by hub.org (8.8.8/8.7.5) with SMTP id WAA29298; Sun, 30 Aug 1998 22:38:18 -0400 (EDT) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Sun, 30 Aug 1998 22:35:05 +0000 (EDT) +Received: (from majordom@localhost) by hub.org (8.8.8/8.7.5) id WAA29203 for pgsql-hackers-outgoing; Sun, 30 Aug 1998 22:35:03 -0400 (EDT) +Received: from sss.sss.pgh.pa.us (sss.pgh.pa.us [206.210.65.6]) by hub.org (8.8.8/8.7.5) with ESMTP id WAA29017 for ; Sun, 30 Aug 1998 22:34:55 -0400 (EDT) +Received: from sss.sss.pgh.pa.us (localhost [127.0.0.1]) + by sss.sss.pgh.pa.us (8.9.1/8.9.1) with ESMTP id WAA20075; + Sun, 30 Aug 1998 22:34:41 -0400 (EDT) +To: The Hermit Hacker +cc: PostgreSQL Hackers +Subject: Re: [HACKERS] flock patch breaks things here +In-reply-to: Your message of Sun, 30 Aug 1998 16:21:28 -0300 (ADT) + +Date: Sun, 30 Aug 1998 22:34:40 -0400 +Message-ID: <20073.904530880@sss.pgh.pa.us> +From: Tom Lane +Sender: owner-pgsql-hackers@hub.org +Precedence: bulk +Status: ROr + +The Hermit Hacker writes: +> either way, moving the pid file (or +> socket, for that matter) from /tmp should be listed as a security related +> requirement for v6.4 :) + +Huh? There is no pid file being generated in /tmp (or anywhere else) +at the moment. If we do add one, it should not go into /tmp for the +reasons I gave before. + +Where the Unix-domain socket file lives is an entirely separate issue. + +If we move the socket out of /tmp then we have just kicked away all the +work we did to preserve backwards compatibility of the FE/BE protocol +with existing clients. Being able to talk to a 1.0 client isn't much +good if you aren't listening where he's going to try to contact you. +So I think I have to vote in favor of leaving the socket where it is. 
+ + regards, tom lane + + +From owner-pgsql-hackers@hub.org Mon Aug 31 11:31:19 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id LAA21195 + for ; Mon, 31 Aug 1998 11:31:13 -0400 (EDT) +Received: from hub.org (hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id LAA06827 for ; Mon, 31 Aug 1998 11:17:41 -0400 (EDT) +Received: from localhost (majordom@localhost) by hub.org (8.8.8/8.7.5) with SMTP id LAA24792; Mon, 31 Aug 1998 11:12:18 -0400 (EDT) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Mon, 31 Aug 1998 11:10:31 +0000 (EDT) +Received: (from majordom@localhost) by hub.org (8.8.8/8.7.5) id LAA24742 for pgsql-hackers-outgoing; Mon, 31 Aug 1998 11:10:29 -0400 (EDT) +Received: from trillium.nmsu.edu (trillium.NMSU.Edu [128.123.5.15]) by hub.org (8.8.8/8.7.5) with ESMTP id LAA24725 for ; Mon, 31 Aug 1998 11:10:22 -0400 (EDT) +Received: (from brook@localhost) + by trillium.nmsu.edu (8.8.8/8.8.8) id JAA03282; + Mon, 31 Aug 1998 09:09:01 -0600 (MDT) +Date: Mon, 31 Aug 1998 09:09:01 -0600 (MDT) +Message-Id: <199808311509.JAA03282@trillium.nmsu.edu> +From: Brook Milligan +To: tgl@sss.pgh.pa.us +CC: dg@informix.com, hackers@postgreSQL.org +In-reply-to: <23042.904573041@sss.pgh.pa.us> (message from Tom Lane on Mon, 31 + Aug 1998 10:17:21 -0400) +Subject: Re: [HACKERS] flock patch breaks things here +References: <23042.904573041@sss.pgh.pa.us> +Sender: owner-pgsql-hackers@hub.org +Precedence: bulk +Status: ROr + + I just came up with an idea that might help alleviate the /tmp security + exposure without creating a backwards-compatibility problem. It works + like this: + + 1. During installation, create a subdirectory of /tmp to hold Postgres' + socket files and associated pid lockfiles. This subdirectory should be + owned by the Postgres superuser and have permissions 755 + (world-readable, writable only by Postgres superuser). Maybe call it + /tmp/.pgsql --- the name should start with a dot to keep it out of the + way. (Bruce points out that some systems clear /tmp during reboot, so + it might be that a postmaster will have to be prepared to recreate this + directory at startup --- anyone know if subdirectories of /tmp are + zapped too? My system doesn't do that...) + + ... + + I notice that on my system, the X11 socket files in /tmp/.X11-unix are + actually symlinks to socket files in /usr/spool/sockets/X11. I dunno if + it's worth our trouble to get into putting our sockets under /usr/spool + or /var/spool or whatever --- seems like another configuration choice to + mess up. It'd be nice if the socket directory lived somewhere where the + parent dirs weren't world-writable, but this would mean one more thing + that you have to have root permissions for in order to install pgsql. + +It seems like we need a directory for locks (= pid files) and one for +sockets (perhaps the same one). I strongly suggest that the location +for these be configurable. By default, it might make sense to put +them in ~pgsql/locks and ~pgsql/sockets. It is easy (i.e., I'll be +glad to do it) to modify configure.in to take options like + + --lock-dir=/var/spool/lock + --socket-dir=/var/spool/sockets + +that set cc defines and have the code respond accordingly. This way, +those who don't care (or don't have root access) can use the defaults, +whereas those with root access who like to keep locks and sockets in a +common place can do so easily. 
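+
+("Set cc defines and have the code respond accordingly" would look roughly
+like the following; the macro name PG_SOCKET_DIR is invented here for
+illustration, since the real option and symbol names are still to be
+decided:
+
+    #include <stdio.h>
+
+    /* configure --socket-dir=... would arrange for PG_SOCKET_DIR
+     * to be defined as that directory string */
+    #ifndef PG_SOCKET_DIR
+    #define PG_SOCKET_DIR "/tmp"    /* fall back to the traditional location */
+    #endif
+
+    int
+    main(void)
+    {
+        char path[256];
+
+        sprintf(path, "%s/.s.PGSQL.%d", PG_SOCKET_DIR, 5432);
+        printf("socket would be created at %s\n", path);
+        return 0;
+    }
+)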
Either way, multiple postmasters (all +compiled with the same options of course) can check the appropriate +locks in the well-known places. Finally, drop the link into /tmp for +the old socket and document that it will be disappearing at some +point, and all is fine. + +If someone wants to give me some guidance on what preprocessor +variables should be set in response to the above options (or something +like them), I'll do the configure stuff. + +Cheers, +Brook + + diff --git a/doc/TODO.detail/fsync b/doc/TODO.detail/fsync new file mode 100644 index 0000000000..f36366ca52 --- /dev/null +++ b/doc/TODO.detail/fsync @@ -0,0 +1,69 @@ +From owner-pgsql-general@hub.org Fri Dec 18 06:31:23 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id GAA05554 + for ; Fri, 18 Dec 1998 06:31:21 -0500 (EST) +Received: from hub.org (majordom@hub.org [209.47.145.100]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id EAA21127 for ; Fri, 18 Dec 1998 04:46:38 -0500 (EST) +Received: from localhost (majordom@localhost) + by hub.org (8.9.1/8.9.1) with SMTP id EAA01409; + Fri, 18 Dec 1998 04:44:19 -0500 (EST) + (envelope-from owner-pgsql-general@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Fri, 18 Dec 1998 04:43:22 +0000 (EST) +Received: (from majordom@localhost) + by hub.org (8.9.1/8.9.1) id EAA01093 + for pgsql-general-outgoing; Fri, 18 Dec 1998 04:43:18 -0500 (EST) + (envelope-from owner-pgsql-general@postgreSQL.org) +Received: from dune.krs.ru (dune.krs.ru [195.161.16.38]) + by hub.org (8.9.1/8.9.1) with ESMTP id EAA01067 + for ; Fri, 18 Dec 1998 04:43:09 -0500 (EST) + (envelope-from vadim@krs.ru) +Received: from krs.ru (localhost.krs.ru [127.0.0.1]) + by dune.krs.ru (8.8.8/8.8.7) with ESMTP id QAA16201; + Fri, 18 Dec 1998 16:41:44 +0700 (KRS) + (envelope-from vadim@krs.ru) +Message-ID: <367A2354.E998763@krs.ru> +Date: Fri, 18 Dec 1998 16:41:40 +0700 +From: Vadim Mikheev +Organization: OJSC Rostelecom (Krasnoyarsk) +X-Mailer: Mozilla 4.5 [en] (X11; I; FreeBSD 2.2.6-RELEASE i386) +X-Accept-Language: ru, en +MIME-Version: 1.0 +To: Anton de Wet +CC: pgsql-general@postgreSQL.org +Subject: Re: [GENERAL] Why PostgreSQL is better than other commerial softwares? +References: +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +Sender: owner-pgsql-general@postgreSQL.org +Precedence: bulk +Status: RO + +Anton de Wet wrote: +> +> > +> > Often quick mailing list support? +> +> :-) +> +> While on the subject I finally found the solution to a problem I (and one +> or two other people) posted about without answer. (So sometimes it's slow +> mailing list support). +> +> In importing about 5 million records (which I copy in blocks of 10000) the +> copy became linearly slower. After a friend RTFM and refered me, I used +> the -F switch (passed by the postmaster to the backend processes) and the +> time became linear and a LOT shorter. Import time for the 5000000 records +> now the same (or maybe even slightly faster, I didn't accurately time +> them) as importing the data into oracle on the same machine. + +"While on the subject..." -:) + +This is the problem of buffer manager, known for very long time: +when copy eats all buffers, manager begins write/fsync each +durty buffer to free buffer for new data. All updated relations +should be fsynced _once_ @ transaction commit. You would get +the same results without -F... 
+I still have no time to implement this -:( + +Vadim + + diff --git a/doc/TODO.detail/lex b/doc/TODO.detail/lex new file mode 100644 index 0000000000..acccb250a0 --- /dev/null +++ b/doc/TODO.detail/lex @@ -0,0 +1,332 @@ +From selkovjr@mcs.anl.gov Sat Jul 25 05:31:05 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id FAA16564 + for ; Sat, 25 Jul 1998 05:31:03 -0400 (EDT) +Received: from antares.mcs.anl.gov (mcs.anl.gov [140.221.9.6]) by renoir.op.net (o1/$Revision: 1.1 $) with SMTP id FAA01775 for ; Sat, 25 Jul 1998 05:28:22 -0400 (EDT) +Received: from mcs.anl.gov (wit.mcs.anl.gov [140.221.5.148]) by antares.mcs.anl.gov (8.6.10/8.6.10) with ESMTP + id EAA28698 for ; Sat, 25 Jul 1998 04:27:05 -0500 +Sender: selkovjr@mcs.anl.gov +Message-ID: <35B9968D.21CF60A2@mcs.anl.gov> +Date: Sat, 25 Jul 1998 08:25:49 +0000 +From: "Gene Selkov, Jr." +Organization: MCS, Argonne Natl. Lab +X-Mailer: Mozilla 4.03 [en] (X11; I; Linux 2.0.32 i586) +MIME-Version: 1.0 +To: Bruce Momjian +Subject: position-aware scanners +References: <199807250524.BAA07296@candle.pha.pa.us> +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +Status: RO + +Bruce, + +I attached here (trough the web links) a couple examples, totally +irrelevant to postgres but good enough to discuss token locations. I +might as well try to patch the backend parser, though not sure how soon. + + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +1. + +The first c parser I wrote, +http://wit.mcs.anl.gov/~selkovjr/unit-troff.tgz, is not very +sophisticated, so token locations reported by yyerr() may be slightly +incorrect (+/- one position depending on the existence and type of the +lookahead token. It is a filter used to typeset the units of measurement +with eqn. To use it, unpack the tar file and run make. The Makefile is +not too generic but I built it on various systems including linux, +freebsd and sunos 4.3. The invocation can be something like this: + +./check 0 parse "l**3/(mmoll*min)" +parse error, expecting `BASIC_UNIT' or `INTEGER' or `POSITIVE_NUMBER' or +`'('' + +l**3/(mmoll*min) + ^^^^^ + +Now to the guts. As far as I can imagine, the only way to consistently +keep track of each character read by the scanner (regardless of the +length of expressions it will match) is to redefine its YY_INPUT like +this: + +#undef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ +{ \ + int c = (int) buffer[pos++]; \ + result = (c == '\0') ? YY_NULL : (buf[0] = c, 1); \ +} + +Here, buffer is the pointer to the origin of the string being scanned +and pos is a global variable, similar in usage to a file pointer (you +can both read and manipulate it at will). The buffer and the pointer are +initialized by the function + +void setString(char *s) +{ + buffer = s; + pos = 0; +} + +each time the new string is to be parsed. This (exportable) function is +part of the interface. + +In this simplistic design, yyerror() is part of the scanner module and +it uses the pos variable to report the location of unexpected tokens. 
+The downside of such arrangement is that in case of error condition, you +can't easily tell whether your context is current or lookahead token, it +just reports the position of the last token read (be it $ (end of +buffer) or something else): + +./check 0 convert "mol/foo" +parse error, expecting `BASIC_UNIT' or `INTEGER' or `POSITIVE_NUMBER' or +`'('' + +mol/foo + ^^^ + +(should be at the beginning of "foo") + +./check 0 convert "mmol//l" +parse error, expecting `BASIC_UNIT' or `INTEGER' or `POSITIVE_NUMBER' or +`'('' + +mmol//l + ^ + +(should be at the second '/') + + +I believe this is why most simple parsers made with yacc would report +parse errors being "at or near" some token, which is fair enough if the +expression is not too complex. + + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +2. The second version of the same scanner, +http://wit.mcs.anl.gov/~selkovjr/scanner-example.tgz, addresses this +problem by recording exact locations of the tokens in each instance of +the token semantic data structure. The global, + +UNIT_YYSTYPE unit_yylval; + +would be normally used to export the token semantics (including its +original or modified text and location data) to the parser. +Unfortunately, I cannot show you the parser part in c, because that's +about when I stopped writing parsers in c. Instead, I included a small +test program, test.c, that mimics the parser's expectations for the +scanner data pretty well. I am assuming here that you are not interested +in digging someone else's ugly guts for relatively small bit of +information; let me know if I am wrong and I will send you the complete +perl code (also generated with bison). + +To run this example, unpack the tar file and run Make. Then do + + gcc test.c scanner.o + +and run a.out + +Note the line + + yylval = unit_getyylval(); + +in test.c. You will not normally need it in a c parser. It is enough to +define yylval as an external variable and link it to yylval in yylex() + +In the bison-generated parser, yylval gets pushed into a stack (pointed +to by yylsp) each time a new token is read. For each syntax rule, the +bison macros @1, @2, ... are just shortcuts to locations in the stack 1, +2, ... levels deep. In following code fragment, @3 refers to the +location info for the third term in the rule (INTEGER): + +(sorry about perl, but I think you can do the same things in c without +significant changes to your existing parser) + +term: base { + $$ = $1; + $$->{'order'} = 1; + } + | base EXP INTEGER { + $$ = $1; + $$->{'order'} = @3->{'text'}; + $$->{'scale'} = $$->{'scale'} ** $$->{'order'}; + if ( $$->{'order'} == 0 ) { + yyerror("Error: expecting a non-zero +integer exponent"); + YYERROR; + } + } + + +which translates to: + + ($yyn == 10) && do { + $yyval = $yyvsa[-1]; + $yyval->{'order'} = 1; + last SWITCH; + }; + + ($yyn == 11) && do { + $yyval = $yyvsa[-3]; + $yyval->{'order'} = $yylsa[-1]->{'text'} + $yyval->{'scale'} = $yyval->{'scale'} ** $yyval->{'order'}; + if ( $yyval->{'order'} == 0 ) { + yyerror("Error: expecting a non-zero integer +exponent"); + goto yyerrlab1 ; + } + last SWITCH; + }; + +In c, you will have a bit more complicated pointer arithmetic to adress +the stack, but the usage of objects will be the same. Note here that it +is convenient to keep all information about the token in its location +info, (yylsa, yylsp, yylval, @n), while everything relating to the value +of the expression, or to the parse tree, is better placed in the +semantic stack (yyssa, yyssp, yysval, $n). 
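+
+(The caret-style report itself is easy to produce in C once the scanner
+has saved the input buffer and the token's first_column and text length;
+a standalone sketch, not taken from the parsers above:
+
+    #include <stdio.h>
+
+    /* Echo the input and put a run of carets under the offending token. */
+    void
+    report_error_at(const char *buffer, int first_column, int token_len,
+                    const char *msg)
+    {
+        int i;
+
+        fprintf(stderr, "%s\n\n%s\n", msg, buffer);
+        for (i = 0; i < first_column; i++)
+            fputc(' ', stderr);
+        for (i = 0; i < token_len; i++)
+            fputc('^', stderr);
+        fputc('\n', stderr);
+    }
+
+    int
+    main(void)
+    {
+        /* the "mol/foo" example from above: "foo" starts at column 4 */
+        report_error_at("mol/foo", 4, 3,
+                        "parse error, expecting `BASIC_UNIT' or `INTEGER' ...");
+        return 0;
+    }
+)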
Also note that in some cases +you can do semantic checks inside rules and report useful messages +before or instead of invoking yyerror(); + +Finally, it is useful to make the following wrapper function around +external yylex() in order to maintain your own token stack. Unlike the +parser's internal stack which is only as deep as the rule being reduced, +this one can hold all tokens recognized during the current run, and that +can be extremely helpful for error reporting and any transformations you +may need. In this way, you can even scan (tokenize) the whole buffer +before handing it off to the parser (who knows, you may need a token +ahead of what is currently seen by the parser): + + +sub tokenize { + undef @tokenTable; + my ($tok, $text, $name, $unit, $first_line, $first_column, +$last_line, $last_column); + + while ( ($tok = &UnitLex::yylex()) > 0 ) { # this is where the +c-coded yylex is called, + # UnitLex is the perl +extension encapsulating it + ( $text, $name, $unit, $first_line, $first_column, $last_line, +$last_column ) = &UnitLex::getyylval; + push(@tokenTable, + Unit::yyltype->new ( + 'token' => $tok, + 'text' => $text, + 'name' => $name, + 'unit' => $unit, + 'first_line' => $first_line, + 'first_column' => $first_column, + 'last_line' => $last_line, + 'last_column' => $last_column, + ) + ) + } + +} + + +It is now a lot easier to handle various state-related problems, such as +backtracking and error reporting. The yylex() function as seen by the +parser might be constructed somewhat like this: + +sub yylex { + $yylloc = $tokenTable[$tokenNo]; # $tokenNo is a global; now +instead of a "file pointer", + # as in the first example, we have +a "token pointer" + undef $yylval; + + + # disregard this; name this block "computing semantic values" + if ( $yylloc->{'token'} == UNIT) { + $yylval = Unit::Operand->new( + 'unit' => Unit::Dict::unit($yylloc->{'unit'}), + 'base' => Unit::Dict::base($yylloc->{'unit'}), + 'scale' => Unit::Dict::scale($yylloc->{'unit'}), + 'scaleToBase' => Unit::Dict::scaleToBase($yylloc->{'unit'}), + 'loc' => $yylloc, + ); + } + elsif ( ($yylloc->{'token'} == INTEGER ) || ($yylloc->{'token'} == +POSITIVE_NUMBER) ) { + $yylval = Unit::Operand->new( + 'unit' => '1', + 'base' => '1', + 'scale' => 1, + 'scaleToBase' => 1, + 'loc' => $yylloc, + ); + } + + $tokenNo++; + return(%{$yylloc}->{'token'}); # This is all the parser needs to +know about this token. + # But we already made sure we saved +everything we need to know. +} + + +Now the most interesting part, the error reporting routine: + + +sub yyerror { + my ($str) = @_; + my ($message, $start, $end, $loc); + + $loc = $tokenTable[$tokenNo-1]; # This is the same as to say, + # "obtain the location info for the +current token" + + # You may use this routine for your own purposes or let parser use +it + if( $str ne 'parse error' ) { + $message = "$str instead of `" . $loc->{'name'} . "' <" . +$loc->{'text'} . ">, at line " . $loc->{'first_line'} . ":\n\ +n"; + } + else { + $message = "unexpected token `" . $loc->{'name'} . "' <" . +$loc->{'text'} . ">, at line " . loc->{'first_line'} . ":\n +\n"; + } + + $message .= $parseBuffer . "\n"; # that's the original string that +was used to set the parser buffer + + $message .= ( ' ' x ($loc->{'first_column'} + 1) ) . ( '^' x +length($loc->{'text'}) ). 
"\n"; + if( $str ne 'parse error' ) { + print STDERR "$str instead of `", $loc->{'name'}, "' {", +$loc->{'text'}, "}, at line ", $loc->{'first_line'}, ":\n\n"; + } + else { + print STDERR "unexpected token `", $loc->{'name'}, "' {", +$loc->{'text'}, "}, at line ", $loc->{'first_line'}, ":\n\n"; + } + + print STDERR "$parseBuffer\n"; + print STDERR ' ' x ($loc->{'first_column'} + 1), '^' x +length($loc->{'text'}), "\n"; +} + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Scanners used in these examples assume there is a single line of text on +the input (the first_line and last_line elements of yylloc are simply +ignored). If you want to be able to parse multi-line buffers, just add a +lex rule for '\n' that will increment the line count and reset the pos +variable to zero. + + +Ugly as it may seem, I find this approach extremely liberating. If the +grammar becomes too complicated for a LALR(1) parser, I can cascade +multiple parsers. The token table can then be used to reassemble parts +of original expression for subordinate parsers, preserving the location +info all the way down, so that subordinate parsers can report their +problems consistently. You probably don't need this, as SQL is very well +thought of and has parsable grammar. But it may be of some help, for +error reporting. + + +--Gene + diff --git a/doc/TODO.detail/limit b/doc/TODO.detail/limit new file mode 100644 index 0000000000..401d08c67a --- /dev/null +++ b/doc/TODO.detail/limit @@ -0,0 +1,5708 @@ +From owner-pgsql-hackers@hub.org Tue Oct 13 15:05:53 1998 +Received: from hub.org (majordom@hub.org [209.47.148.200]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id PAA09435 + for ; Tue, 13 Oct 1998 15:05:50 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id OAA11700; + Tue, 13 Oct 1998 14:43:31 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Tue, 13 Oct 1998 14:41:03 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id OAA11395 + for pgsql-hackers-outgoing; Tue, 13 Oct 1998 14:41:00 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from terry1.acun.com (terry@terry1.acun.com [206.27.86.12]) + by hub.org (8.8.8/8.8.8) with ESMTP id OAA11372 + for ; Tue, 13 Oct 1998 14:40:54 -0400 (EDT) + (envelope-from terry@terrym.com) +Received: from localhost (terry@localhost) + by terry1.acun.com (8.8.5/8.8.5) with SMTP id OAA09491 + for ; Tue, 13 Oct 1998 14:53:22 -0400 +Date: Tue, 13 Oct 1998 14:53:22 -0400 (EDT) +From: Terry Mackintosh +X-Sender: terry@terry1.acun.com +Reply-To: Terry Mackintosh +To: PostgreSQL-development +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +In-Reply-To: +Message-ID: +MIME-Version: 1.0 +Content-Type: TEXT/PLAIN; charset=US-ASCII +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +Hi, my 2 cents... + +I agree completely, LIMIT would be VERY usefull in web based apps, which +is all I run. It does not matter to me if it is not part of a formal +standard. The idea is so common that it is a defacto standard. + +I would not expect it for this release, but could it get put on the TODO +list for next time? I am even willing to work at an apprentise level on +this with a more expeireanced person that knows this stuff. 
+ +A note on implimentation: +I *used to* :) work with VFP on NT's :( +And the way VFP did LIMIT, it would only return the number of rows asked +for, BUT it still did the WHOLE search! +So on a larger table, which we had (property tax database for the county), +if some one put in too vague a query, it would try to collect ALL of the +rows as the initial result set, then give you the first x rows of that. + +This did save on pushing mass amounts of data out to the browser, but it +would have been even better if it could have simply aborted the select +after having found x rows. + +Also, it did not have the concept of an offset, so one could not select +100 rows, starting 200 rows in, which would be REALLY usefull for "paging" +through data. I do not know if mySQL or any other has such a concept +either, but it would be nice. + +So a properly implemented "LIMIT" could: +1. Save pushing mass amounts of data across the web, that no one wants +any way. +2. Stop vague queries from bogging down the server. +(On very larg tables this could be critical!) +3. Enable "Paging" of data. (easyer then now (app. level)) +4. Would be a very nice feather in PostgreSQL's cap that could make it +even more attractive to those looking at all sorts of databases out there. + +Have a great day. + +On Tue, 13 Oct 1998, Oleg Bartunov wrote: + +> Hi, +> +> I took a look at mysql and was very impressed with possibility +> to limit number of rows returned from select. This is very useful +> feature for Web applications when user need to browse results of +> selection page by page. In my application I have to do full +> select every time user press button [Next] and show requested page +> using perl. This works more or less ok for several thousands rows but +> totally unusable for large selections. But now I'm about to work +> with big database and I don't know how I'll stay with postgres :-) +> It'll just doesn't work if customer will wait several minutes just browse +> next page. Mysql lacks some useful features postgres has +> (subselects, transaction ..) but for most Web applications I need +> just select :-) I dont' know how LIMIT is implemented in Mysql and +> I know it's not in SQL92 standart, but this makes Mysql very popular. +> +> Is it difficult to implement this feature in postgres ? +> +> Regards, +> +> Oleg +> +> +> _____________________________________________________________ +> Oleg Bartunov, sci.researcher, hostmaster of AstroNet, +> Sternberg Astronomical Institute, Moscow University (Russia) +> Internet: oleg@sai.msu.su, http://www.sai.msu.su/~megera/ +> phone: +007(095)939-16-83, +007(095)939-23-83 +> +> + +Terry Mackintosh http://www.terrym.com +sysadmin/owner Please! No MIME encoded or HTML mail, unless needed. + +Proudly powered by R H Linux 4.2, Apache 1.3, PHP 3, PostgreSQL 6.3 +------------------------------------------------------------------- +Success Is A Choice ... book by Rick Patino, get it, read it! 
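+
+(For comparison, this is roughly what fetching one "page" looks like from
+a C client such as a CGI today, using a cursor through libpq; the
+connection string, table name, column name and key value are invented for
+the example.  A LIMIT with an offset, as asked for above, would collapse
+the DECLARE/FETCH pair into a single SELECT:
+
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include "libpq-fe.h"
+
+    int
+    main(void)
+    {
+        PGconn   *conn = PQconnectdb("dbname=test");
+        PGresult *res;
+        int       i;
+
+        if (PQstatus(conn) == CONNECTION_BAD)
+        {
+            fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
+            exit(1);
+        }
+
+        /* one page: the 20 rows after the last key already shown */
+        PQclear(PQexec(conn, "BEGIN"));
+        PQclear(PQexec(conn, "DECLARE page CURSOR FOR "
+                             "SELECT name FROM tab WHERE name > 'BROWN' "
+                             "ORDER BY name"));
+        res = PQexec(conn, "FETCH 20 IN page");
+
+        for (i = 0; i < PQntuples(res); i++)
+            printf("%s\n", PQgetvalue(res, i, 0));
+
+        PQclear(res);
+        PQclear(PQexec(conn, "END"));
+        PQfinish(conn);
+        return 0;
+    }
+)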
+ + + + +From owner-pgsql-hackers@hub.org Tue Oct 13 18:12:41 1998 +Received: from hub.org (majordom@hub.org [209.47.148.200]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id SAA12156 + for ; Tue, 13 Oct 1998 18:12:39 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id RAA04181; + Tue, 13 Oct 1998 17:56:17 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Tue, 13 Oct 1998 17:54:49 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id RAA03869 + for pgsql-hackers-outgoing; Tue, 13 Oct 1998 17:54:47 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from remapcorp.com (root@remapcorp.com [206.196.37.193]) + by hub.org (8.8.8/8.8.8) with ESMTP id RAA03838 + for ; Tue, 13 Oct 1998 17:54:36 -0400 (EDT) + (envelope-from jeff@remapcorp.com) +Received: from go-to-jail (gotojail.remapcorp.com [206.196.37.197]) + by remapcorp.com (8.8.7/8.8.7) with SMTP id QAA25337; + Tue, 13 Oct 1998 16:55:35 -0500 (CDT) + (envelope-from jeff@remapcorp.com) +Message-ID: <006701bdf6f4$60ed75f0$c525c4ce@go-to-jail.remapcorp.com> +From: "Jeff Hoffmann" +To: "Marc G. Fournier" , "Eric Lee Green" +Cc: "PostgreSQL-development" +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +Date: Tue, 13 Oct 1998 16:56:48 -0500 +MIME-Version: 1.0 +Content-Type: text/plain; + charset="iso-8859-1" +Content-Transfer-Encoding: 7bit +X-Priority: 3 +X-MSMail-Priority: Normal +X-Mailer: Microsoft Outlook Express 4.72.3115.0 +X-MimeOLE: Produced By Microsoft MimeOLE V4.72.3110.3 +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +>On Tue, 13 Oct 1998, Eric Lee Green wrote: +> +>> On Tue, 13 Oct 1998, Jeff Hoffmann wrote: +>> > >I agree completely, LIMIT would be VERY usefull in web based apps, +which +>> > >is all I run. It does not matter to me if it is not part of a formal +>> > >standard. The idea is so common that it is a defacto standard. +>> > +>> > i'm not familiar with mysql and using "LIMIT" but wouldn't this same +effect +>> > be achieved by declaring a cursor and fetching however many records in +the +>> > cursor? it's a very noticeable improvement when you only want the +first 20 +>> > out of 500 in a 200k record database, at least. +>> +>> The problem with declaring a cursor vs. the "LIMIT" clause is that the +>> "LIMIT" clause, if used properly by the database engine (along with the +>> database engine using indexes in "ORDER BY" clauses) allows the database +>> engine to short-circuit the tail end of the query. That is, if you have +25 +>> names and the last one ends with BEAVIS, the database engine doesn't have +>> to go through the BUTTHEADS and KENNYs and etc. +>> +>> Theoretically a cursor is superior to the "LIMIT" clause because you're +>> eventually going to want the B's and K's and etc. anyhow -- but only in a +>> stateful enviornment. In the stateless web environment, a cursor is +>> useless because the connection can close at any time even when you're +>> using "persistent" connections (and of course when the connection closes +>> the cursor closes). +> +>Ookay, I'm sorry, butyou lost me here. I haven't gotten into using +>CURSORs/FETCHs yet, since I haven't need it...but can you give an example +>of what you would want to do using a LIMIT? 
I may be missing something, +>but wha is the different between using LIMIT to get X records, and +>definiing a cursor to FETCH X records? +> +>Practical example of *at least* the LIMIT side would be good, so that we +>can at least see a physical example of what LIMIT can do that +>CURSORs/FETCH can't... +> + + +fetch with cursors should work properly (i.e., you can short circuit it by +just ending your transaction) my understanding on how this works is exactly +how you explained LIMIT to work. here's some empirical proof from one of my +sample databases: + +the sample table i'm using has 156k records (names of people) +i'm using a PP180 with 128MB RAM and some old slow SCSI drives. + +public_mn=> select count(*) from public_ramsey; + count +------ +156566 +(1 row) + +i did the following query: +public_mn=> select * from public_ramsey where ownerlname ~ 'SMITH'; + +which returned 711 matches and took about 12 seconds. + +i did the same thing with a cursor: + +public_mn=> begin; +BEGIN +public_mn=> declare test cursor for select * from public_ramsey where +ownerlname ~ 'SMITH'; +SELECT + +the select was instantaneous. + +public_mn=> fetch 20 in test; + +returns 20 records almost instantaneously. each additional 20 took less +than a second, as well. + +if this isn't what you're talking about, i don't understand what you're +saying. + +jeff + + + +From eric@ireland.linux-hw.com Tue Oct 13 18:52:42 1998 +Received: from ireland.linux-hw.com (IDENT:eric@ireland.linux-hw.com [199.72.95.215]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id SAA12388 + for ; Tue, 13 Oct 1998 18:52:40 -0400 (EDT) +Received: from localhost (eric@localhost) + by ireland.linux-hw.com (8.8.7/8.8.7) with SMTP id SAA31316; + Tue, 13 Oct 1998 18:55:22 -0400 +Date: Tue, 13 Oct 1998 18:55:22 -0400 (EDT) +From: Eric Lee Green +To: Bruce Momjian +cc: jeff@remapcorp.com, hackers@postgreSQL.org +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +In-Reply-To: <199810132116.RAA11249@candle.pha.pa.us> +Message-ID: +MIME-Version: 1.0 +Content-Type: TEXT/PLAIN; charset=US-ASCII +Status: RO + +On Tue, 13 Oct 1998, Bruce Momjian wrote: +> > Theoretically a cursor is superior to the "LIMIT" clause because you're +> > eventually going to want the B's and K's and etc. anyhow -- but only in a +> > stateful enviornment. In the stateless web environment, a cursor is +> > useless because the connection can close at any time even when you're +> > using "persistent" connections (and of course when the connection closes +> What we could do is _if_ there is only one table(no joins), and an index +> exists that matches the ORDER BY, we could use the index to +> short-circuit the query. + +This is exactly what MySQL does in this situation, except that it can use +the ORDER BY to do the short circuiting even if there is a join involved +if all of the elements of the ORDER BY belong to one table. Obviously if +I'm doing an "ORDER BY table1.foo table2.bar" that isn't going to work! +But "select table1.fsname,table1.lname,table2.receivables where +table2.receivables > 0 and table1.custnum=table2.custnum order by +(table1.lname,table1.fsname) limit 50" can be short-circuited by fiddling +with the join order -- table1.fsname table1.lname have to be the first two +things in the join order. + +Whether this is feasible in PostgreSQL I have no earthly idea. This would +seem to conflict with the join optimizer. + +> happier? If there is an ORDER BY and no index, or a join, I can't +> figure out how we would short-circuit the query. 
+ +If there is an ORDER BY and no index you can't short-circuit the query. +MySQL doesn't either. Under certain circumstances (such as above) you can +short-circuit a join, but it's unclear whether it'd be easy to add such +a capability to PostgreSQL given the current structure of the query +optimizer. (And I certainly am not in a position to tackle it, at the +moment MySQL is sufficing for my project despite the fact that it is +quite limited compared to PostgreSQL, I need to get my project finished +first). + +-- +Eric Lee Green eric@linux-hw.com http://www.linux-hw.com/~eric +"To call Microsoft an innovator is like calling the Pope Jewish ..." + -- James Love (Consumer Project on Technology) + + +From owner-pgsql-hackers@hub.org Wed Oct 14 09:01:01 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id JAA24574 + for ; Wed, 14 Oct 1998 09:01:00 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id HAA17762 for ; Wed, 14 Oct 1998 07:47:57 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id HAA09214; + Wed, 14 Oct 1998 07:04:59 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 14 Oct 1998 07:00:44 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id HAA09116 + for pgsql-hackers-outgoing; Wed, 14 Oct 1998 07:00:40 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by hub.org (8.8.8/8.8.8) with ESMTP id HAA09102 + for ; Wed, 14 Oct 1998 07:00:27 -0400 (EDT) + (envelope-from wieck@sapserv.debis.de) +Received: by dsh.de; id NAA05037; Wed, 14 Oct 1998 13:02:40 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma004737; Wed, 14 Oct 98 13:02:09 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id MAA20155; + Wed, 14 Oct 1998 12:59:23 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id NAA20772; + Wed, 14 Oct 1998 13:01:35 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zTMGL-000B5AC; Wed, 14 Oct 98 10:26 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for eric@linux-hw.com + id m0zTOnx-000EBRC; Wed, 14 Oct 98 13:09 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +To: eric@linux-hw.com (Eric Lee Green) +Date: Wed, 14 Oct 1998 13:09:21 +0200 (MET DST) +Cc: jeff@remapcorp.com, hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: from "Eric Lee Green" at Oct 13, 98 04:24:20 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +Eric Lee Green wrote: +> +> On Tue, 13 Oct 1998, Jeff Hoffmann wrote: +> > >I agree completely, LIMIT would be VERY usefull in web based apps, which +> > >is all I run. It does not matter to me if it is not part of a formal +> > >standard. The idea is so common that it is a defacto standard. 
+> > +> > i'm not familiar with mysql and using "LIMIT" but wouldn't this same effect +> > be achieved by declaring a cursor and fetching however many records in the +> > cursor? it's a very noticeable improvement when you only want the first 20 +> > out of 500 in a 200k record database, at least. +> +> The problem with declaring a cursor vs. the "LIMIT" clause is that the +> "LIMIT" clause, if used properly by the database engine (along with the +> database engine using indexes in "ORDER BY" clauses) allows the database +> engine to short-circuit the tail end of the query. That is, if you have 25 +> names and the last one ends with BEAVIS, the database engine doesn't have +> to go through the BUTTHEADS and KENNYs and etc. +> +> Theoretically a cursor is superior to the "LIMIT" clause because you're +> eventually going to want the B's and K's and etc. anyhow -- but only in a +> stateful enviornment. In the stateless web environment, a cursor is +> useless because the connection can close at any time even when you're +> using "persistent" connections (and of course when the connection closes +> the cursor closes). + + I'm missing something. Well it's right that in the stateless + web environment a cursor has to be declared and closed for + any single CGI call. But even if you have a LIMIT clause, + your CGI must know with which key to start. + + So your query must look like + + SELECT ... WHERE key > 'last processed key' ORDER BY key; + + And your key must be unique (or at least contain no duplicate + entries) or you might miss some rows between the pages (have + 100 Brown's in the table and last processed key was a Brown + while using LIMIT). + + In postgres you could actually do the following (but read on + below - it's not optimized correct) + + BEGIN; + DECLARE c CURSOR FOR SELECT ... WHERE key > 'last' ORDER BY key; + FETCH 20 IN c; + (process the 20 rows in CGI) + CLOSE c; + COMMIT; + + Having LIMIT looks more elegant and has less overhead in CGI- + backend communication. But the cursor version is SQL + standard and portable. + +> +> I wanted very badly to use PostgreSQL for a web project I'm working on, +> but it just wouldn't do the job :-(. + + I've done some tests and what I found out might be a bug in + PostgreSQL's query optimizer. Having a table with 25k rows + where key is a text field with a unique index. Now I used + EXPLAIN for some queries + + SELECT * FROM tab; + + results in a seqscan - expected. + + SELECT * FROM tab ORDER BY key; + + results in a sort->seqscan - I would have + expected an indexscan! + + SELECT * FROM tab WHERE key > 'G'; + + results in an indexscan - expected. + + SELECT * FROM tab WHERE key > 'G' ORDER BY key; + + results in a sort->indexscan - hmmm. + + These results stay the same even if I blow up the table by + duplicating all rows (now with a non-unique index) to 100k + rows and have them presorted in the table. + + Needless to say that everything is vacuum'd for statistics. + + The last one is the query we would need in the web + environment used over a cursor as in the example above. But + due to the sort, the backend selects until the end of the + table, sorts them and then returns only the first 20 rows + (out of sorts result). + + This is very painful if the qualification (key > ...) points + to the beginning of the key list. + + Looking at planner.c I can see, that if there is a sortClause + in the parsetree, the planner creates a sort node and does + absolutely not check if there is an index that could be used + to do it. 
In the examples above, the sort is absolutely + needless because the index scan will already return the + tuples in the right order :-). + + Somewhere deep in my brain I found a statement that sorting + sorted data isn't only unnecessary (except the order + changes), it is slow too compared against sorting randomly + ordered data. + + Can we fix this before 6.4 release, will it be a past 6.4 or + am I doing something wrong here? I think it isn't a fix (it's + a planner enhancement) so it should really be a past 6.4 + item. + + For now, the only possibility is to omit the ORDER BY in the + query and hope the planner will always generate an index scan + (because of the qualification 'key > ...'). Doing so I + selected multiple times 20 rows (with the last key qual like + a CGI would do) in separate transactions. Using cursor and + fetch speeds up the access by a factor of 1000! But it is + unsafe and thus NOT RECOMMENDED! It's only a test if cursors + can do the LIMIT job - and they could if the planner would do + a better job. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. # +#======================================== jwieck@debis.com (Jan Wieck) # + + + + +From owner-pgsql-hackers@hub.org Wed Oct 14 11:02:04 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id LAA25519 + for ; Wed, 14 Oct 1998 11:02:02 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id JAA24583 for ; Wed, 14 Oct 1998 09:46:21 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id IAA17022; + Wed, 14 Oct 1998 08:59:20 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 14 Oct 1998 08:54:40 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id IAA16687 + for pgsql-hackers-outgoing; Wed, 14 Oct 1998 08:54:34 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from ra.sai.msu.su (ra.sai.msu.su [158.250.29.2]) + by hub.org (8.8.8/8.8.8) with ESMTP id IAA16656 + for ; Wed, 14 Oct 1998 08:54:00 -0400 (EDT) + (envelope-from oleg@sai.msu.su) +Received: from ra (ra [158.250.29.2]) + by ra.sai.msu.su (8.9.1/8.9.1) with SMTP id PAA11714; + Wed, 14 Oct 1998 15:53:53 +0300 (MSK) +Date: Wed, 14 Oct 1998 16:53:53 +0400 (MSD) +From: Oleg Bartunov +X-Sender: megera@ra +Reply-To: Oleg Bartunov +To: hackers@postgreSQL.org +cc: t-ishii@sra.co.jp +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +In-Reply-To: +Message-ID: +Organization: Sternberg Astronomical Institute (Moscow University) +MIME-Version: 1.0 +Content-Type: TEXT/PLAIN; charset=US-ASCII +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +On Wed, 14 Oct 1998, Jan Wieck wrote: + +> Date: Wed, 14 Oct 1998 13:09:21 +0200 (MET DST) +> From: Jan Wieck +> To: Eric Lee Green +> Cc: jeff@remapcorp.com, hackers@postgreSQL.org +> Subject: Re: [HACKERS] What about LIMIT in SELECT ? +> +> Eric Lee Green wrote: +> > +> > On Tue, 13 Oct 1998, Jeff Hoffmann wrote: +> > > >I agree completely, LIMIT would be VERY usefull in web based apps, which +> > > >is all I run. 
It does not matter to me if it is not part of a formal +> > > >standard. The idea is so common that it is a defacto standard. +> > > +> > > i'm not familiar with mysql and using "LIMIT" but wouldn't this same effect +> > > be achieved by declaring a cursor and fetching however many records in the +> > > cursor? it's a very noticeable improvement when you only want the first 20 +> > > out of 500 in a 200k record database, at least. +> > +> > The problem with declaring a cursor vs. the "LIMIT" clause is that the +> > "LIMIT" clause, if used properly by the database engine (along with the +> > database engine using indexes in "ORDER BY" clauses) allows the database +> > engine to short-circuit the tail end of the query. That is, if you have 25 +> > names and the last one ends with BEAVIS, the database engine doesn't have +> > to go through the BUTTHEADS and KENNYs and etc. +> > +> > Theoretically a cursor is superior to the "LIMIT" clause because you're +> > eventually going to want the B's and K's and etc. anyhow -- but only in a +> > stateful enviornment. In the stateless web environment, a cursor is +> > useless because the connection can close at any time even when you're +> > using "persistent" connections (and of course when the connection closes +> > the cursor closes). +> +> I'm missing something. Well it's right that in the stateless +> web environment a cursor has to be declared and closed for +> any single CGI call. But even if you have a LIMIT clause, +> your CGI must know with which key to start. +> + This is not a problem for CGI-script to know which key to start. + Without LIMIT every CGI call backend will do *FULL* selection + and cursor helps just in fetching a definite number of rows, + in principle I can do this with CGI-script. Also, cursor + returns data back in ASCII format (man l declare) and this requires + additional job for backend to convert data from intrinsic (binary) + format. Right implementation of LIMIT offset,number_of_rows could be + a great win and make postgres superior free database engine for + Web applications. Many colleagues of mine used mysql instead of + postgres just because of lacking LIMIT. Tatsuo posted a patch + for set query_limit to 'num', I just tested it and seems it + works fine. Now, we need only possibility to specify offset, + say + set query_limit to 'offset,num' + ( Tatsuo, How difficult to do this ?) + and LIMIT problem will ne gone. + + I'm wonder how many useful patches could be hidden from people :-), + + Regards, + + Oleg + +PS. + + Tatsuo, do you have patch for 6.3.2 ? 
+ I can't wait for 6.4 :-) +_____________________________________________________________ +Oleg Bartunov, sci.researcher, hostmaster of AstroNet, +Sternberg Astronomical Institute, Moscow University (Russia) +Internet: oleg@sai.msu.su, http://www.sai.msu.su/~megera/ +phone: +007(095)939-16-83, +007(095)939-23-83 + + + + + +From owner-pgsql-hackers@hub.org Wed Oct 14 11:02:00 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id LAA25510 + for ; Wed, 14 Oct 1998 11:01:59 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id KAA28854 for ; Wed, 14 Oct 1998 10:40:56 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id KAA21542; + Wed, 14 Oct 1998 10:03:45 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 14 Oct 1998 09:59:10 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id JAA21121 + for pgsql-hackers-outgoing; Wed, 14 Oct 1998 09:59:08 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from golem.jpl.nasa.gov (root@hectic-2.jpl.nasa.gov [128.149.68.204]) + by hub.org (8.8.8/8.8.8) with ESMTP id JAA21106 + for ; Wed, 14 Oct 1998 09:59:02 -0400 (EDT) + (envelope-from lockhart@alumni.caltech.edu) +Received: from alumni.caltech.edu (localhost [127.0.0.1]) + by golem.jpl.nasa.gov (8.8.5/8.8.5) with ESMTP id NAA19587; + Wed, 14 Oct 1998 13:59:56 GMT +Message-ID: <3624AE5C.752E4E7F@alumni.caltech.edu> +Date: Wed, 14 Oct 1998 13:59:56 +0000 +From: "Thomas G. Lockhart" +Organization: Caltech/JPL +X-Mailer: Mozilla 4.07 [en] (X11; I; Linux 2.0.30 i686) +MIME-Version: 1.0 +To: Jan Wieck +CC: Eric Lee Green , jeff@remapcorp.com, + hackers@postgreSQL.org +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +References: +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +> I've done some tests and what I found out might be a bug in +> PostgreSQL's query optimizer. +> SELECT * FROM tab ORDER BY key; +> results in a sort->seqscan - I would have +> expected an indexscan! + +Given that a table _could_ be completely unsorted on disk, it is +probably reasonable to suck the data in for a possible in-memory sort +rather than skipping around the disk to pick up individual tuples via +the index. Don't know if vacuum has a statistic on "orderness"... + +> SELECT * FROM tab WHERE key > 'G' ORDER BY key; +> results in a sort->indexscan - hmmm. +> The last one is the query we would need in the web +> environment used over a cursor as in the example above. But +> due to the sort, the backend selects until the end of the +> table, sorts them and then returns only the first 20 rows +> (out of sorts result). + +So you are saying that for this last case the sort was unnecessary? Does +the backend traverse the index in the correct order to guarantee that +the tuples are coming out already sorted? Does a hash index give the +same plan (I would expect a sort->seqscan for a hash index)? 
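+
+(One way to try that comparison, reusing the "tab"/"key" names from
+Jan's test; the index names below are made up for illustration, and
+assume the existing btree index is called tab_key_btree:)
+
+    -- with the btree index on key: the reported plan is sort->indexscan
+    EXPLAIN SELECT * FROM tab WHERE key > 'G' ORDER BY key;
+
+    -- replace it with a hash index; hash only handles '=' quals, so
+    -- the 'key > ...' qualification can't use it and a seqscan under
+    -- the sort is what one would expect
+    DROP INDEX tab_key_btree;
+    CREATE INDEX tab_key_hash ON tab USING hash (key);
+    EXPLAIN SELECT * FROM tab WHERE key > 'G' ORDER BY key;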
+ + - Tom + + +From owner-pgsql-hackers@hub.org Wed Oct 14 11:01:52 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id LAA25504 + for ; Wed, 14 Oct 1998 11:01:51 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id KAA00198 for ; Wed, 14 Oct 1998 10:57:15 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id KAA22877; + Wed, 14 Oct 1998 10:19:47 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 14 Oct 1998 10:15:44 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id KAA22675 + for pgsql-hackers-outgoing; Wed, 14 Oct 1998 10:15:41 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by hub.org (8.8.8/8.8.8) with ESMTP id KAA22657 + for ; Wed, 14 Oct 1998 10:15:32 -0400 (EDT) + (envelope-from wieck@sapserv.debis.de) +Received: by dsh.de; id QAA20563; Wed, 14 Oct 1998 16:18:02 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma020404; Wed, 14 Oct 98 16:17:25 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id QAA05077; + Wed, 14 Oct 1998 16:14:48 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id QAA22248; + Wed, 14 Oct 1998 16:17:06 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zTPJb-000B5AC; Wed, 14 Oct 98 13:42 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for hackers@postgreSQL.org + id m0zTRrE-000EBRC; Wed, 14 Oct 98 16:24 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +To: oleg@sai.msu.su +Date: Wed, 14 Oct 1998 16:24:56 +0200 (MET DST) +Cc: hackers@postgreSQL.org, t-ishii@sra.co.jp +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: from "Oleg Bartunov" at Oct 14, 98 04:53:53 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +Oleg Bartunov wrote: +> This is not a problem for CGI-script to know which key to start. + + Never meant that would be a problem. A FORM variable will of + course do this. + +> Without LIMIT every CGI call backend will do *FULL* selection +> and cursor helps just in fetching a definite number of rows, +> in principle I can do this with CGI-script. Also, cursor +> returns data back in ASCII format (man l declare) and this requires +> additional job for backend to convert data from intrinsic (binary) +> format. Right implementation of LIMIT offset,number_of_rows could be +> a great win and make postgres superior free database engine for +> Web applications. Many colleagues of mine used mysql instead of + + That's the point I was missing. The offset! + +> postgres just because of lacking LIMIT. Tatsuo posted a patch +> for set query_limit to 'num', I just tested it and seems it +> works fine. Now, we need only possibility to specify offset, +> say +> set query_limit to 'offset,num' +> ( Tatsuo, How difficult to do this ?) +> and LIMIT problem will ne gone. 
+ + Think you haven't read my posting completely. Even with the + executor limit, the complete scan into the sort is done by + the backend. You need to specify ORDER BY to get the same + list again (without the offset doesn't make sense). But + currently, ORDER BY forces a sort node into the query plan. + + What the executor limit tells is how many rows will be + returned from the sorted data. Not what goes into the sort. + Filling the sort and sorting the data consumes the most time + of the queries execution. + + I haven't looked at Tatsuo's patch very well. But if it + limits the amount of data going into the sort (on ORDER BY), + it will break it! The requested ordering could be different + from what the choosen index might return. The used index is + choosen by the planner upon the qualifications given, not the + ordering wanted. + + So if you select WHERE b = 1 ORDER BY a, then it will use an + index on attribute b to match the qualification. The complete + result of that index scan goes into the sort to get ordered + by a. If now the executor limit stops sort filling after the + limit is exceeded, only the same tuples will go into the sort + every time. But they have nothing to do with the requested + order by a. + + What LIMIT first needs is a planner enhancement. In file + backend/optimizer/plan/planner.c line 284 it must be checked + if the actual plan is an indexscan, if the indexed attributes + are all the same as those in the given sort clause and that + the requested sort order (operator) is that what the index + will return. If that all matches, it can ignore the sort + clause and return the index scan itself. + + Second enhancement must be the handling of the offset. In + the executor, the index scan must skip offset index tuples + before returning the first. But NOT if the plan isn't a + 1-table-index-scan. In that case the result tuples (from the + topmost unique/join/whatever node) have to be skipped. + + With these enhancements, the index tuples to be skipped + (offset) will still be scanned, but not the data tuples they + point to. Index scanning might be somewhat faster. + + This all will only speedup simple 1-table-queries, no joins + or if the requested order isn't that what the index exactly + returns. + + Anyway, I'll take a look if I can change the planner to omit + the sort if the tests described above are true. I think it + would be good anyway. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. 
# +#======================================== jwieck@debis.com (Jan Wieck) # + + + + +From owner-pgsql-hackers@hub.org Wed Oct 14 11:01:36 1998 +Received: from hub.org (majordom@hub.org [209.47.148.200]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id LAA25489 + for ; Wed, 14 Oct 1998 11:01:34 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id KAA24286; + Wed, 14 Oct 1998 10:30:14 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 14 Oct 1998 10:26:34 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id KAA23732 + for pgsql-hackers-outgoing; Wed, 14 Oct 1998 10:26:27 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from dsh.de (neptun.sns-felb.debis.de [53.122.101.2]) + by hub.org (8.8.8/8.8.8) with ESMTP id KAA23717 + for ; Wed, 14 Oct 1998 10:26:13 -0400 (EDT) + (envelope-from wieck@sapserv.debis.de) +Received: by dsh.de; id QAA25644; Wed, 14 Oct 1998 16:28:01 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma025301; Wed, 14 Oct 98 16:27:43 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id QAA05943; + Wed, 14 Oct 1998 16:24:42 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id QAA22339; + Wed, 14 Oct 1998 16:26:57 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zTPT8-000B5AC; Wed, 14 Oct 98 13:51 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for lockhart@alumni.caltech.edu + id m0zTS0m-000EBRC; Wed, 14 Oct 98 16:34 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +To: lockhart@alumni.caltech.edu (Thomas G. Lockhart) +Date: Wed, 14 Oct 1998 16:34:47 +0200 (MET DST) +Cc: jwieck@debis.com, eric@linux-hw.com, jeff@remapcorp.com, + hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <3624AE5C.752E4E7F@alumni.caltech.edu> from "Thomas G. Lockhart" at Oct 14, 98 01:59:56 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +> +> > SELECT * FROM tab WHERE key > 'G' ORDER BY key; +> > results in a sort->indexscan - hmmm. +> > The last one is the query we would need in the web +> > environment used over a cursor as in the example above. But +> > due to the sort, the backend selects until the end of the +> > table, sorts them and then returns only the first 20 rows +> > (out of sorts result). +> +> So you are saying that for this last case the sort was unnecessary? Does +> the backend traverse the index in the correct order to guarantee that +> the tuples are coming out already sorted? Does a hash index give the +> same plan (I would expect a sort->seqscan for a hash index)? + + Good point! As far as I can see, the planner chooses index + usage only depending on the WHERE clause. A hash index is + only usable when the given qualification uses = on the + indexed attribute(s). + + If the sortClause exactly matches the indexed attributes of + the ONE used btree index and all operators request ascending + order I think the index scan already returns the correct + order. 
Who know's definitely? + + Addition to my last posting: ... and if the index scan is + using a btree index ... + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. # +#======================================== jwieck@debis.com (Jan Wieck) # + + + + +From owner-pgsql-hackers@hub.org Wed Oct 14 13:55:58 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id NAA29300 + for ; Wed, 14 Oct 1998 13:55:56 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id NAA14245 for ; Wed, 14 Oct 1998 13:49:19 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id NAA13110; + Wed, 14 Oct 1998 13:25:55 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 14 Oct 1998 13:22:14 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id NAA12694 + for pgsql-hackers-outgoing; Wed, 14 Oct 1998 13:22:13 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from candle.pha.pa.us (maillist@s5-03.ppp.op.net [209.152.195.67]) + by hub.org (8.8.8/8.8.8) with ESMTP id NAA12677 + for ; Wed, 14 Oct 1998 13:22:05 -0400 (EDT) + (envelope-from maillist@candle.pha.pa.us) +Received: (from maillist@localhost) + by candle.pha.pa.us (8.9.0/8.9.0) id NAA28746; + Wed, 14 Oct 1998 13:21:15 -0400 (EDT) +From: Bruce Momjian +Message-Id: <199810141721.NAA28746@candle.pha.pa.us> +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +In-Reply-To: <3624AE5C.752E4E7F@alumni.caltech.edu> from "Thomas G. Lockhart" at "Oct 14, 1998 1:59:56 pm" +To: lockhart@alumni.caltech.edu (Thomas G. Lockhart) +Date: Wed, 14 Oct 1998 13:21:15 -0400 (EDT) +Cc: jwieck@debis.com, eric@linux-hw.com, jeff@remapcorp.com, + hackers@postgreSQL.org +X-Mailer: ELM [version 2.4ME+ PL47 (25)] +MIME-Version: 1.0 +Content-Type: text/plain; charset=US-ASCII +Content-Transfer-Encoding: 7bit +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +> > I've done some tests and what I found out might be a bug in +> > PostgreSQL's query optimizer. +> > SELECT * FROM tab ORDER BY key; +> > results in a sort->seqscan - I would have +> > expected an indexscan! +> +> Given that a table _could_ be completely unsorted on disk, it is +> probably reasonable to suck the data in for a possible in-memory sort +> rather than skipping around the disk to pick up individual tuples via +> the index. Don't know if vacuum has a statistic on "orderness"... + +Thomas is correct on this. Vadim has run some tests, and with our +optimized psort() code, the in-memory sort is often faster than using +the index to get the tuple, because you are jumping all over the drive. +I don't remember, but obviously there is a break-even point where +getting X rows using the index on a table of Y rows is faster , but +getting X+1 rows on a table of Y rows is faster getting all the rows +sequentailly, and doing the sort. 
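+
+(Illustration only, again with Jan's "tab"/"key" names; which side of
+that break-even point a query falls on is exactly what the optimizer
+would have to estimate from the statistics:)
+
+    -- only a few rows wanted: following the index order is attractive
+    SELECT * FROM tab WHERE key < 'B' ORDER BY key;
+
+    -- most of the table wanted: one sequential pass plus the psort is
+    -- usually cheaper than jumping all over the disk via the index
+    SELECT * FROM tab ORDER BY key;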
+ +You would have to pick only certain queries(no joins, index matches +ORDER BY), take the number of rows requested, and the number of rows +selected, and figure out if it is faster to use the index, or a +sequential scan and do the ORDER BY yourself. + + +Add to this the OFFSET capability. I am not sure how you are going to +get into the index and start at the n-th entry, unless perhaps you just +sequential scan the index. + +In fact, many queries just get column already indexed, and we could just +pull the data right out of the index. + +I have added this to the TODO list: + + * Pull requested data directly from indexes, bypassing heap data + +I think this has to be post-6.4 work, but I think we need to work in +this direction. I am holding off any cnfify fixes for post-6.4, but a +6.4.1 performance release certainly is possible. + + +But, you are correct that certain cases where in index is already being +used on a query, you could just skip the sort IF you used the index to +get the rows from the base table. + +-- + Bruce Momjian | http://www.op.net/~candle + maillist@candle.pha.pa.us | (610) 853-3000 + + If your life is a hard drive, | 830 Blythe Avenue + + Christ can be your backup. | Drexel Hill, Pennsylvania 19026 + + +From owner-pgsql-hackers@hub.org Wed Oct 14 13:55:59 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id NAA29303 + for ; Wed, 14 Oct 1998 13:55:58 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id NAA13463 for ; Wed, 14 Oct 1998 13:39:05 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id NAA11655; + Wed, 14 Oct 1998 13:13:32 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 14 Oct 1998 13:09:41 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id NAA11013 + for pgsql-hackers-outgoing; Wed, 14 Oct 1998 13:09:39 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from terry1.acun.com (terry@terry1.acun.com [206.27.86.12]) + by hub.org (8.8.8/8.8.8) with ESMTP id NAA10997 + for ; Wed, 14 Oct 1998 13:09:30 -0400 (EDT) + (envelope-from terry@terrym.com) +Received: from localhost (terry@localhost) + by terry1.acun.com (8.8.5/8.8.5) with SMTP id NAA14478; + Wed, 14 Oct 1998 13:21:51 -0400 +Date: Wed, 14 Oct 1998 13:21:51 -0400 (EDT) +From: Terry Mackintosh +X-Sender: terry@terry1.acun.com +To: Jeff Hoffmann +cc: PostgreSQL-development +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +In-Reply-To: <005101bdf6de$f9345150$c525c4ce@go-to-jail.remapcorp.com> +Message-ID: +MIME-Version: 1.0 +Content-Type: TEXT/PLAIN; charset=US-ASCII +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +On Tue, 13 Oct 1998, Jeff Hoffmann wrote: + +> >Hi, my 2 cents... +> > +> >I agree completely, LIMIT would be VERY usefull in web based apps, which +> >is all I run. It does not matter to me if it is not part of a formal +> >standard. The idea is so common that it is a defacto standard. +> +> i'm not familiar with mysql and using "LIMIT" but wouldn't this same effect +> be achieved by declaring a cursor and fetching however many records in the +> cursor? 
it's a very noticeable improvement when you only want the first 20 +> out of 500 in a 200k record database, at least. + +Yes, while this is an improvement, it still has to do the entire query, +would be nice if the query could be terminated after a designated number +of rows where found, thus freeing system resources that are other wise +consumed. +I have seen web users run ridculous querys, like search for the +letter 'a', and it happens to be a substring search, now the box go'es ape +shit for 5 or 10 min.s while it basically gets the whole db as the search +result. All this befor you can do a 'FETCH', as I understand FETCH, I +will need to read up on it. + +Note that I do not have any databases that larg on my box, I was thinking +back to my VFP/NT experiances. + +Have a great day +Terry Mackintosh http://www.terrym.com +sysadmin/owner Please! No MIME encoded or HTML mail, unless needed. + +Proudly powered by R H Linux 4.2, Apache 1.3, PHP 3, PostgreSQL 6.3 +------------------------------------------------------------------- +Success Is A Choice ... book by Rick Patino, get it, read it! + + + +From owner-pgsql-hackers@hub.org Wed Oct 14 13:59:05 1998 +Received: from hub.org (majordom@hub.org [209.47.148.200]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id NAA29345 + for ; Wed, 14 Oct 1998 13:58:59 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id NAA14021; + Wed, 14 Oct 1998 13:32:51 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 14 Oct 1998 13:29:09 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id NAA13364 + for pgsql-hackers-outgoing; Wed, 14 Oct 1998 13:29:07 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from terry1.acun.com (terry@terry1.acun.com [206.27.86.12]) + by hub.org (8.8.8/8.8.8) with ESMTP id NAA13328 + for ; Wed, 14 Oct 1998 13:28:56 -0400 (EDT) + (envelope-from terry@terrym.com) +Received: from localhost (terry@localhost) + by terry1.acun.com (8.8.5/8.8.5) with SMTP id NAA14606 + for ; Wed, 14 Oct 1998 13:41:25 -0400 +Date: Wed, 14 Oct 1998 13:41:24 -0400 (EDT) +From: Terry Mackintosh +X-Sender: terry@terry1.acun.com +To: PostgreSQL-development +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +In-Reply-To: <199810132116.RAA11249@candle.pha.pa.us> +Message-ID: +MIME-Version: 1.0 +Content-Type: TEXT/PLAIN; charset=US-ASCII +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +On Tue, 13 Oct 1998, Bruce Momjian wrote: + +> What we could do is _if_ there is only one table(no joins), and an index +> exists that matches the ORDER BY, we could use the index to +> short-circuit the query. +> +> I have added this item to the TODO list: +> +> * Allow LIMIT ability on single-table queries that have no ORDER BY or +> a matching index +> +> This looks do-able, and a real win. Would this make web applications +> happier? If there is an ORDER BY and no index, or a join, I can't +> figure out how we would short-circuit the query. +> +Yes, this would do for most of my apps. +It may just be my lack of sophistication, but I find that most web apps +are very simple in nature/table layout, and thus queries are often on only +a single table. + +Thanks +Terry Mackintosh http://www.terrym.com +sysadmin/owner Please! No MIME encoded or HTML mail, unless needed. 
+ +Proudly powered by R H Linux 4.2, Apache 1.3, PHP 3, PostgreSQL 6.3 +------------------------------------------------------------------- +Success Is A Choice ... book by Rick Patino, get it, read it! + + + +From wieck@sapserv.debis.de Wed Oct 14 13:55:53 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id NAA29290 + for ; Wed, 14 Oct 1998 13:55:51 -0400 (EDT) +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id NAA14370 for ; Wed, 14 Oct 1998 13:51:19 -0400 (EDT) +Received: by dsh.de; id TAA03418; Wed, 14 Oct 1998 19:50:18 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma003369; Wed, 14 Oct 98 19:49:51 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id TAA16746; + Wed, 14 Oct 1998 19:47:14 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id TAA23570; + Wed, 14 Oct 1998 19:49:32 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zTSdF-000B5AC; Wed, 14 Oct 98 17:14 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for lockhart@alumni.caltech.edu + id m0zTVAt-000EBRC; Wed, 14 Oct 98 19:57 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +To: maillist@candle.pha.pa.us (Bruce Momjian) +Date: Wed, 14 Oct 1998 19:57:27 +0200 (MET DST) +Cc: lockhart@alumni.caltech.edu, jwieck@debis.com, eric@linux-hw.com, + jeff@remapcorp.com, hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <199810141721.NAA28746@candle.pha.pa.us> from "Bruce Momjian" at Oct 14, 98 01:21:15 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Status: RO + +> But, you are correct that certain cases where in index is already being +> used on a query, you could just skip the sort IF you used the index to +> get the rows from the base table. + + Especially in the case where + + SELECT ... WHERE key > 'val' ORDER BY key; + + creates a Sort->IndexScan plan. The index scan already jumps + around on the disc to collect the sorts input and the sort + finally returns exactly the same output (if the used index is + only on key). + + And this is the case for large tables. The planner first + decides to use an index scan due to the WHERE clause and + later it notices the ORDER BY clause and creates a sort over + the scan. + + I'm actually hacking around on it to see what happens if I + suppress the sort node in some cases. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. 
# +#======================================== jwieck@debis.com (Jan Wieck) # + + + +From owner-pgsql-hackers@hub.org Wed Oct 14 16:31:07 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id QAA01119 + for ; Wed, 14 Oct 1998 16:31:05 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id PAA22534 for ; Wed, 14 Oct 1998 15:29:50 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id PAA26335; + Wed, 14 Oct 1998 15:05:26 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 14 Oct 1998 15:02:13 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id PAA26013 + for pgsql-hackers-outgoing; Wed, 14 Oct 1998 15:02:11 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from candle.pha.pa.us (root@s5-03.ppp.op.net [209.152.195.67]) + by hub.org (8.8.8/8.8.8) with ESMTP id PAA25996 + for ; Wed, 14 Oct 1998 15:01:58 -0400 (EDT) + (envelope-from maillist@candle.pha.pa.us) +Received: (from maillist@localhost) + by candle.pha.pa.us (8.9.0/8.9.0) id OAA29639; + Wed, 14 Oct 1998 14:27:05 -0400 (EDT) +From: Bruce Momjian +Message-Id: <199810141827.OAA29639@candle.pha.pa.us> +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +In-Reply-To: <199810141721.NAA28746@candle.pha.pa.us> from Bruce Momjian at "Oct 14, 1998 1:21:15 pm" +To: maillist@candle.pha.pa.us (Bruce Momjian) +Date: Wed, 14 Oct 1998 14:27:05 -0400 (EDT) +Cc: lockhart@alumni.caltech.edu, jwieck@debis.com, eric@linux-hw.com, + jeff@remapcorp.com, hackers@postgreSQL.org +X-Mailer: ELM [version 2.4ME+ PL47 (25)] +MIME-Version: 1.0 +Content-Type: text/plain; charset=US-ASCII +Content-Transfer-Encoding: 7bit +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +> Thomas is correct on this. Vadim has run some tests, and with our +> optimized psort() code, the in-memory sort is often faster than using +> the index to get the tuple, because you are jumping all over the drive. +> I don't remember, but obviously there is a break-even point where +> getting X rows using the index on a table of Y rows is faster , but +> getting X+1 rows on a table of Y rows is faster getting all the rows +> sequentailly, and doing the sort. +> +> You would have to pick only certain queries(no joins, index matches +> ORDER BY), take the number of rows requested, and the number of rows +> selected, and figure out if it is faster to use the index, or a +> sequential scan and do the ORDER BY yourself. +> +> Add to this the OFFSET capability. I am not sure how you are going to +> get into the index and start at the n-th entry, unless perhaps you just +> sequential scan the index. +> +> In fact, many queries just get column already indexed, and we could just +> pull the data right out of the index. +> +> I have added this to the TODO list: +> +> * Pull requested data directly from indexes, bypassing heap data +> +> I think this has to be post-6.4 work, but I think we need to work in +> this direction. I am holding off any cnfify fixes for post-6.4, but a +> 6.4.1 performance release certainly is possible. 
+> +> +> But, you are correct that certain cases where in index is already being +> used on a query, you could just skip the sort IF you used the index to +> get the rows from the base table. + +I have had more time to think about this. Basically, for pre-sorted +data, our psort code is very fast, because it does not need to sort +anything. It just moves the rows in and out of the sort memory. Yes, +it could be removed in some cases, and probably should be, but it is not +going to produce great speedups. + +The more general case I will describe below. + +First, let's look at a normal query: + + SELECT * + FROM tab + ORDER BY col1 + +This is not going to use an index, and probably should not because it is +faster for large tables to sort them in memory, rather than moving all +over the disk. For small tables, if the entire table fits in the buffer +cache, it may be faster to use the index, but on a small table the sort +doesn't take very long either, and the buffer cache effectiveness is +affected by other backends using it, so it may be better not to count on +it for a speedup. + +However, if you only want the first 10 rows, that is a different story. +We pull all the rows into the backend, sort them, then return 10 rows. +The query, if we could do it, should be written as: + + SELECT * + FROM tab + WHERE col1 < some_unknown_value + ORDER BY col1 + +In this case, the optimizer looks at the column statistics, and properly +uses an index to pull only a small subset of the table. This is the +type of behavior people want for queries returning only a few values. + +But, unfortunately, we don't know that mystery value. + +Now, everyone agrees we need an index matching the ORDER BY to make this +query quick, but we don't know that mystery value, so currently we +execute the whole query, and do a fetch. + +What I am now thinking is that maybe we need a way to walk around that +index. Someone months ago asked how to do that, and we told him he +couldn't, because this not a C-ISAM/dbm type database. However, if we +could somehow pass into the query the index location we want to start +at, and how many rows we need, that would solve our problem, and perhaps +even allow joined queries to work, assuming the table in the ORDER BY is +in an outer join loop. + + SELECT * + FROM tab + WHERE col1 < some_unknown_value + ORDER BY col1 + USING INDEX tab_idx(452) COUNT 100 + +where 452 is an 452th index entry, and COUNT is the number of index rows +you want to process. The query may return more or less than 100 rows if +there is a join and it joins to zero or more than one row in the joined +table, but this seems like perhaps a good way to go at it. We need to +do it this way because if a single index row returns 4 result rows, and +only two of the four rows fit in the number of rows returnd as set by the +user, it is hard to re-start the query at the proper point, because you +would have to process the index rows a second time, and return just part +of the result, and that is hard. + +If the index changes, or rows are added, the results are going to be +unreliable, but that is probably going to be true of any state-less +implementation we can devise. + +I think this may be fairly easy to implement. We could sequential scan +the index to get to the 452th row. That is going to be quick. We can +pass the 452 into the btree index code, so only a certain range of index +tuples are returned, and the system believes it has processed the entire +query, while we know it hasn't. 
Doesn't really work with hash, so we +will not allow it for those indexes. + +To make it really easy, we could implement it as a 'SET' command, so we +don't actually have it as part of the query, and have to pass it around +through all the modules. You would do the proper 'SET' before running +the query. Optimizer would look at 'SET' value to force index use. + + SET INDEX TO tab_idx START 452 COUNT 100 + +or + + SET INDEX TO tab_idx FROM 452 COUNT 451 + +There would have to be some way to signal that the end of the index had +been reached, because returning zero rows is not enough of a guarantee +in a joined SELECT. + +Comments? + +-- + Bruce Momjian | http://www.op.net/~candle + maillist@candle.pha.pa.us | (610) 853-3000 + + If your life is a hard drive, | 830 Blythe Avenue + + Christ can be your backup. | Drexel Hill, Pennsylvania 19026 + + +From owner-pgsql-hackers@hub.org Wed Oct 14 17:31:23 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id RAA01591 + for ; Wed, 14 Oct 1998 17:31:21 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id RAA02744 for ; Wed, 14 Oct 1998 17:26:53 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id RAA05601; + Wed, 14 Oct 1998 17:03:21 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 14 Oct 1998 16:59:54 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id QAA04964 + for pgsql-hackers-outgoing; Wed, 14 Oct 1998 16:59:52 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by hub.org (8.8.8/8.8.8) with ESMTP id QAA04943 + for ; Wed, 14 Oct 1998 16:59:28 -0400 (EDT) + (envelope-from wieck@sapserv.debis.de) +Received: by dsh.de; id WAA28383; Wed, 14 Oct 1998 22:57:42 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma028354; Wed, 14 Oct 98 22:57:28 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id WAA20547; + Wed, 14 Oct 1998 22:54:51 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id WAA24383; + Wed, 14 Oct 1998 22:57:09 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zTVYr-000B5AC; Wed, 14 Oct 98 20:22 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for lockhart@alumni.caltech.edu + id m0zTY6V-000EBRC; Wed, 14 Oct 98 23:05 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +To: maillist@candle.pha.pa.us (Bruce Momjian) +Date: Wed, 14 Oct 1998 23:05:07 +0200 (MET DST) +Cc: maillist@candle.pha.pa.us, lockhart@alumni.caltech.edu, jwieck@debis.com, + eric@linux-hw.com, jeff@remapcorp.com, hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <199810141827.OAA29639@candle.pha.pa.us> from "Bruce Momjian" at Oct 14, 98 02:27:05 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +> I have had more time to think about this. 
Basically, for pre-sorted +> data, our psort code is very fast, because it does not need to sort +> anything. It just moves the rows in and out of the sort memory. Yes, +> it could be removed in some cases, and probably should be, but it is not +> going to produce great speedups. + + And I got the time to hack around about this. + + I hacked in a little check into the planner, that compares + the sortClause against the key field list of an index scan + and just suppresses the sort node if it exactly matchs and + all sort operators are "<". + + I tested with a 10k row table where key is a text field. The + base query is a + + SELECT ... WHERE key > 'val' ORDER BY key; + + The used 'val' is always a key that is close to the first of + all keys in the table ('' on the first query and the last + selected value on subsequent ones). + + Scenario 1 (S1) uses exactly the above query but processes + only the first 20 rows from the result buffer. Thus the + frontend receives nearly the whole table. + + Scenario 2 (S2) uses a cursor and FETCH 20. But closes the + cursor and creates a new one for the next selection (only + with another 'val') as it would occur in a web application. + + If there is no index on key, the backend will allways do a + Sort->SeqScan and due to the 'val' close to the lowest + existing key nearly all tuples get scanned and put into the + sort. S1 here runs about 10 seconds and S2 about 6 seconds. + The speedup in S2 results from the reduced overhead of + sending not wanted tuples into the frontend. + + Now with a btree index on key and an unpatched backend. + Produced plan is always a Sort->IndexScan. S1 needs 16 + seconds and S2 needs 12 seconds. Again nearly all data is put + into the sort but this time over the index scan and that is + slower. + + Last with the btree index on key and the patched backend. + This time the plan is a plain IndexScan because the ORDER BY + clause exactly matches the sort order of the choosen index. + S1 needs 13 seconds and S2 less than 0.2! This dramatic + speedup comes from the fact, that this time the index scan is + the toplevel executor node and the executor run is stopped + after 20 tuples have been selected. + + Analysis of the above timings: + + If there is an ORDER BY clause, using an index scan is the + clever way if the indexqual dramatically reduces the the + amount of data selected and sorted. I think this is the + normal case (who really selects nearly all rows from a 5M row + table?). So choosing the index path is correct. This will + hurt if someone really selects most of the rows and the index + scan jumps over the disc. But here the programmer should use + an unqualified query to perform a seqscan and do the + qualification in the frontend application. + + The speedup for the cursor/fetch scenario is so impressive + that I'll create a post 6.4 patch. I don't want it in 6.4 + because there is absolutely no query in the whole regression + test, where it suppresses the sort node. So we have + absolutely no check that it doesn't break anything. + + For a web application, that can use a unique key to select + the next amount of rows, it will be a big win. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. 
# +#======================================== jwieck@debis.com (Jan Wieck) # + + + + +From owner-pgsql-hackers@hub.org Thu Oct 15 00:01:10 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id AAA06040 + for ; Thu, 15 Oct 1998 00:01:04 -0400 (EDT) +Received: from hub.org (root@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id XAA29020 for ; Wed, 14 Oct 1998 23:57:58 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id WAA02215; + Wed, 14 Oct 1998 22:39:07 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 14 Oct 1998 22:35:19 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id WAA02061 + for pgsql-hackers-outgoing; Wed, 14 Oct 1998 22:35:16 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from sraigw.sra.co.jp (sraigw.sra.co.jp [202.32.10.2]) + by hub.org (8.8.8/8.8.8) with ESMTP id WAA01851 + for ; Wed, 14 Oct 1998 22:35:01 -0400 (EDT) + (envelope-from t-ishii@srapc451.sra.co.jp) +Received: from srapc451.sra.co.jp (srapc451 [133.137.44.37]) + by sraigw.sra.co.jp (8.8.7/3.6Wbeta7-sraigw) with ESMTP id LAA17765; + Thu, 15 Oct 1998 11:34:39 +0900 (JST) +Received: from srapc451.sra.co.jp (localhost [127.0.0.1]) by srapc451.sra.co.jp (8.8.8/3.5Wpl7) with ESMTP id LAA08260; Thu, 15 Oct 1998 11:34:54 +0900 (JST) +Message-Id: <199810150234.LAA08260@srapc451.sra.co.jp> +To: jwieck@debis.com (Jan Wieck) +cc: oleg@sai.msu.su, hackers@postgreSQL.org, t-ishii@sra.co.jp +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +From: Tatsuo Ishii +Reply-To: t-ishii@sra.co.jp +In-reply-to: Your message of Wed, 14 Oct 1998 16:24:56 +0200. + +Date: Thu, 15 Oct 1998 11:34:54 +0900 +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +>> postgres just because of lacking LIMIT. Tatsuo posted a patch +>> for set query_limit to 'num', I just tested it and seems it +>> works fine. Now, we need only possibility to specify offset, +>> say +>> set query_limit to 'offset,num' +>> ( Tatsuo, How difficult to do this ?) +>> and LIMIT problem will ne gone. +> +> Think you haven't read my posting completely. Even with the +> executor limit, the complete scan into the sort is done by +> the backend. You need to specify ORDER BY to get the same +> list again (without the offset doesn't make sense). But +> currently, ORDER BY forces a sort node into the query plan. + +I think we have understanded your point. set query_limit is just a +easy alternative of using cursor and fetch. + +> I haven't looked at Tatsuo's patch very well. But if it +> limits the amount of data going into the sort (on ORDER BY), +> it will break it! The requested ordering could be different +> from what the choosen index might return. The used index is +> choosen by the planner upon the qualifications given, not the +> ordering wanted. + +I think it limits the final result. When query_limit is set, +the arg "numberTuples" of ExecutePlan() is set to it instead of 0 +(this means no limit). + +Talking about "offset," it shouldn't be very difficult. I guess all we +have to do is adding a new arg "offset" to ExecutePlan() then making +obvious modifications. 
(and of course we have to modify set +query_limit syntax but it's trivial) + +However, before going ahead, I would like to ask other hackers about +this direction. This might be convenient for some users, but still the +essential performance issue would remain. In another word, this is a +short-term solution not a intrinsic one, IMHO. +-- +Tatsuo Ishii +t-ishii@sra.co.jp + + +From owner-pgsql-hackers@hub.org Thu Oct 15 10:01:17 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id KAA13960 + for ; Thu, 15 Oct 1998 10:01:15 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id JAA20266 for ; Thu, 15 Oct 1998 09:12:21 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id IAA26142; + Thu, 15 Oct 1998 08:19:49 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Thu, 15 Oct 1998 08:13:48 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id IAA25747 + for pgsql-hackers-outgoing; Thu, 15 Oct 1998 08:13:46 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by hub.org (8.8.8/8.8.8) with ESMTP id IAA25733 + for ; Thu, 15 Oct 1998 08:13:40 -0400 (EDT) + (envelope-from wieck@sapserv.debis.de) +Received: by dsh.de; id OAA18677; Thu, 15 Oct 1998 14:16:12 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma018279; Thu, 15 Oct 98 14:15:39 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id OAA01227; + Thu, 15 Oct 1998 14:13:09 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id OAA28938; + Thu, 15 Oct 1998 14:15:27 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zTjtm-000B5AC; Thu, 15 Oct 98 11:40 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for jwieck@debis.com + id m0zTmRT-000EBRC; Thu, 15 Oct 98 14:23 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +To: t-ishii@sra.co.jp +Date: Thu, 15 Oct 1998 14:23:43 +0200 (MET DST) +Cc: jwieck@debis.com, oleg@sai.msu.su, hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <199810150234.LAA08260@srapc451.sra.co.jp> from "Tatsuo Ishii" at Oct 15, 98 11:34:54 am +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +Tatsuo Ishii wrote: + +> I think we have understanded your point. set query_limit is just a +> easy alternative of using cursor and fetch. +> +> > I haven't looked at Tatsuo's patch very well. But if it +> > limits the amount of data going into the sort (on ORDER BY), +> > it will break it! The requested ordering could be different +> > from what the choosen index might return. The used index is +> > choosen by the planner upon the qualifications given, not the +> > ordering wanted. +> +> I think it limits the final result. When query_limit is set, +> the arg "numberTuples" of ExecutePlan() is set to it instead of 0 +> (this means no limit). 
+> +> Talking about "offset," it shouldn't be very difficult. I guess all we +> have to do is adding a new arg "offset" to ExecutePlan() then making +> obvious modifications. (and of course we have to modify set +> query_limit syntax but it's trivial) + + The offset could become + + FETCH n IN cursor [OFFSET n]; + + and + + SELECT ... [LIMIT offset,count]; + + The FETCH command already calls ExecutorRun() with the given + count (the tuple limit). Telling it the offset too is really + simple. And ExecutorRun() could check if the toplevel + executor node is an index scan. Skipping tuples during the + index scan requires, that all qualifications are in the + indexqual, thus any tuple returned by it will become a final + result row (as it would be in the simple 1-table-queries we + discussed). If that isn't the case, the executor must + fallback to skip the final result tuples and that is after an + eventually processed sort/merge of the complete result set. + That would only reduce communication to the client and memory + required there to buffer the result set (not a bad thing + either). + + ProcessQueryDesc() in tcop/pquery.c also calls ExecutorRun() + but with a constant 0 tuple count. Having offset and count in + the parsetree would make it without any state variables or + SET command. And it's the only clean way to restrict LIMIT to + SELECT queries. Any thrown in LIMIT to ExecutorRun() from + another place could badly hurt the rewrite system. Remember + that non-instead actions on insert/update/delete are + processed before the original query! And what about SQL + functions that get processed during the evaluation of another + query (view using an SQL function for count(*))? + + A little better would it be to make the LIMIT values able to + be parameter nodes. C or PL functions use the prepared plan + feature of the SPI manager for performance reasons. + Especially the offset value might there need to be a + parameter that the executor has to pick out first. If we + change the count argument of ExecutorRun to a List *limit, + this one could be NIL (to mean the old 0 count 0 offset + behaviour) or a list of two elements that both can be either + a Const or a Param of type int4. Easy for the executor to + evaluate. + + The only places where ExecutorRun() is called are + tcop/pquery.c (queries from frontend), commands/command.c + (FETCH command), executor/functions.c (SQL functions) and + executor/spi.c (SPI manager). So it is easy to change the + call interface too. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. 
# +#======================================== jwieck@debis.com (Jan Wieck) # + + + + +From owner-pgsql-hackers@hub.org Thu Oct 15 14:32:34 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id OAA19803 + for ; Thu, 15 Oct 1998 14:32:31 -0400 (EDT) +Received: from hub.org (root@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id NAA10847 for ; Thu, 15 Oct 1998 13:38:16 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id MAA22772; + Thu, 15 Oct 1998 12:07:20 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Thu, 15 Oct 1998 12:02:33 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id MAA22026 + for pgsql-hackers-outgoing; Thu, 15 Oct 1998 12:02:31 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from ra.sai.msu.su (ra.sai.msu.su [158.250.29.2]) + by hub.org (8.8.8/8.8.8) with ESMTP id MAA22007 + for ; Thu, 15 Oct 1998 12:02:16 -0400 (EDT) + (envelope-from oleg@sai.msu.su) +Received: from ra (ra [158.250.29.2]) + by ra.sai.msu.su (8.9.1/8.9.1) with SMTP id TAA21024; + Thu, 15 Oct 1998 19:01:23 +0300 (MSK) +Date: Thu, 15 Oct 1998 20:01:23 +0400 (MSD) +From: Oleg Bartunov +X-Sender: megera@ra +To: Jan Wieck +cc: t-ishii@sra.co.jp, hackers@postgreSQL.org +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +In-Reply-To: +Message-ID: +Organization: Sternberg Astronomical Institute (Moscow University) +MIME-Version: 1.0 +Content-Type: TEXT/PLAIN; charset=US-ASCII +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +This is a little bit off-topic, +I did some timings with latest cvs on my real database +( all output redirected to /dev/null ), table contains 8798 records, +31 columns, order key have indices. + +1.select count(*) from work_flats; +0.02user 0.00system 0:00.18elapsed 10%CPU (0avgtext+0avgdata 0maxresident)k +0inputs+0outputs (131major+21minor)pagefaults 0swaps + +2.select * from work_flats order by rooms, metro_id; +2.35user 0.25system 0:10.11elapsed 25%CPU (0avgtext+0avgdata 0maxresident)k +0inputs+0outputs (131major+2799minor)pagefaults 0swaps + +3.set query_limit to '150'; +SET VARIABLE +select * from work_flats order by rooms, metro_id; +0.06user 0.00system 0:02.75elapsed 2%CPU (0avgtext+0avgdata 0maxresident)k +0inputs+0outputs (131major+67minor)pagefaults 0swaps + +4.begin; +declare tt cursor for +select * from work_flats order by rooms, metro_id; +fetch 150 in tt; +end; +0.05user 0.01system 0:02.76elapsed 2%CPU (0avgtext+0avgdata 0maxresident)k +0inputs+0outputs (131major+67minor)pagefaults 0swaps + +As you can see timings for query_limit and cursor are very similar, +I didn't expected this. So, in principle, enhanced version of fetch +(with offset) would cover all we need from LIMIT, but query_limit would be +still useful, for example to restrict loadness of server. +Will all enhancements you discussed go to the 6.4 ? +I'm really interested in testing this stuff because I begin new project +and everything we discussed here are badly needed. 
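+
+(Sketch of the same point: within one transaction the cursor position
+already acts as the offset, so successive pages need no explicit OFFSET
+at all; table and cursor names as in the timings above:)
+
+begin;
+declare tt cursor for
+select * from work_flats order by rooms, metro_id;
+fetch 150 in tt;        -- rows 1 .. 150
+fetch 150 in tt;        -- rows 151 .. 300, the cursor remembers its position
+close tt;
+end;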
+ + + Regards, + + Oleg + + + +On Thu, 15 Oct 1998, Jan Wieck wrote: + +> Date: Thu, 15 Oct 1998 14:23:43 +0200 (MET DST) +> From: Jan Wieck +> To: t-ishii@sra.co.jp +> Cc: jwieck@debis.com, oleg@sai.msu.su, hackers@postgreSQL.org +> Subject: Re: [HACKERS] What about LIMIT in SELECT ? +> +> Tatsuo Ishii wrote: +> +> > I think we have understanded your point. set query_limit is just a +> > easy alternative of using cursor and fetch. +> > +> > > I haven't looked at Tatsuo's patch very well. But if it +> > > limits the amount of data going into the sort (on ORDER BY), +> > > it will break it! The requested ordering could be different +> > > from what the choosen index might return. The used index is +> > > choosen by the planner upon the qualifications given, not the +> > > ordering wanted. +> > +> > I think it limits the final result. When query_limit is set, +> > the arg "numberTuples" of ExecutePlan() is set to it instead of 0 +> > (this means no limit). +> > +> > Talking about "offset," it shouldn't be very difficult. I guess all we +> > have to do is adding a new arg "offset" to ExecutePlan() then making +> > obvious modifications. (and of course we have to modify set +> > query_limit syntax but it's trivial) +> +> The offset could become +> +> FETCH n IN cursor [OFFSET n]; +> +> and +> +> SELECT ... [LIMIT offset,count]; +> +> The FETCH command already calls ExecutorRun() with the given +> count (the tuple limit). Telling it the offset too is really +> simple. And ExecutorRun() could check if the toplevel +> executor node is an index scan. Skipping tuples during the +> index scan requires, that all qualifications are in the +> indexqual, thus any tuple returned by it will become a final +> result row (as it would be in the simple 1-table-queries we +> discussed). If that isn't the case, the executor must +> fallback to skip the final result tuples and that is after an +> eventually processed sort/merge of the complete result set. +> That would only reduce communication to the client and memory +> required there to buffer the result set (not a bad thing +> either). +> +> ProcessQueryDesc() in tcop/pquery.c also calls ExecutorRun() +> but with a constant 0 tuple count. Having offset and count in +> the parsetree would make it without any state variables or +> SET command. And it's the only clean way to restrict LIMIT to +> SELECT queries. Any thrown in LIMIT to ExecutorRun() from +> another place could badly hurt the rewrite system. Remember +> that non-instead actions on insert/update/delete are +> processed before the original query! And what about SQL +> functions that get processed during the evaluation of another +> query (view using an SQL function for count(*))? +> +> A little better would it be to make the LIMIT values able to +> be parameter nodes. C or PL functions use the prepared plan +> feature of the SPI manager for performance reasons. +> Especially the offset value might there need to be a +> parameter that the executor has to pick out first. If we +> change the count argument of ExecutorRun to a List *limit, +> this one could be NIL (to mean the old 0 count 0 offset +> behaviour) or a list of two elements that both can be either +> a Const or a Param of type int4. Easy for the executor to +> evaluate. +> +> The only places where ExecutorRun() is called are +> tcop/pquery.c (queries from frontend), commands/command.c +> (FETCH command), executor/functions.c (SQL functions) and +> executor/spi.c (SPI manager). So it is easy to change the +> call interface too. 
+> +> +> Jan +> +> -- +> +> #======================================================================# +> # It's easier to get forgiveness for being wrong than for being right. # +> # Let's break this rule - forgive me. # +> #======================================== jwieck@debis.com (Jan Wieck) # +> +> + +_____________________________________________________________ +Oleg Bartunov, sci.researcher, hostmaster of AstroNet, +Sternberg Astronomical Institute, Moscow University (Russia) +Internet: oleg@sai.msu.su, http://www.sai.msu.su/~megera/ +phone: +007(095)939-16-83, +007(095)939-23-83 + + + +From owner-pgsql-hackers@hub.org Thu Oct 15 13:22:48 1998 +Received: from hub.org (majordom@hub.org [209.47.148.200]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id NAA18540 + for ; Thu, 15 Oct 1998 13:22:46 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id MAA01819; + Thu, 15 Oct 1998 12:56:25 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Thu, 15 Oct 1998 12:51:43 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id MAA01305 + for pgsql-hackers-outgoing; Thu, 15 Oct 1998 12:51:40 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by hub.org (8.8.8/8.8.8) with ESMTP id MAA01283 + for ; Thu, 15 Oct 1998 12:51:28 -0400 (EDT) + (envelope-from wieck@sapserv.debis.de) +Received: by dsh.de; id SAA21874; Thu, 15 Oct 1998 18:54:00 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma021705; Thu, 15 Oct 98 18:53:31 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id SAA25226; + Thu, 15 Oct 1998 18:50:57 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id SAA30639; + Thu, 15 Oct 1998 18:53:14 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zToEf-000B5AC; Thu, 15 Oct 98 16:18 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for jwieck@debis.com + id m0zTqmM-000EBRC; Thu, 15 Oct 98 19:01 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +To: hannu@trust.ee (Hannu Krosing) +Date: Thu, 15 Oct 1998 19:01:33 +0200 (MET DST) +Cc: jwieck@debis.com, pgsql-hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <36261DF7.D20368A0@trust.ee> from "Hannu Krosing" at Oct 15, 98 07:08:23 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +Hannu Krosing wrote: + +> Jan Wieck wrote: +> > The speedup for the cursor/fetch scenario is so impressive +> > that I'll create a post 6.4 patch. I don't want it in 6.4 +> > because there is absolutely no query in the whole regression +> > test, where it suppresses the sort node. 
+> +> Good, then it works as expected ;) +> +> More seriously, it is not within powers of current regression test +> framework to test speed improvements (only the case where +> performance-wise bad implementation will actually crash the backend, +> as in the cnfify problem, but AFAIK we dont test for those now) +> +> > So we have absolutely no check that it doesn't break anything. +> +> If it did pass the regression, then IMHO it did not break anything. + + Thats the point. The check if the sort node is required + returns TRUE for ALL queries of the regression. So the + behaviour when it returns FALSE is absolutely not tested. + +> +> I would vote for putting it in (maybe with a +> 'set fix_optimiser_stupidity on' safeguard to enable it). I see no +> reason to postpone it to 6.4.1 and force almost everybody to first +> patch their copy and then upgrade very soon. +> +> I would even go far enough to call it a bugfix, as it does not really +> introduce any new functionality only fixes some existing functionality +> so that much bigger databases can be actually used. + + I can't call it a bugfix because it is only a performance win + in some situations. And I feel the risk is too high to put + untested code into the backend at BETA2 time. The max we + should do is to take this one and the LIMIT thing (maybe + implemented as I suggested lately), and put out a Web- + Performance-Release at the same time we release 6.4. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. # +#======================================== jwieck@debis.com (Jan Wieck) # + + + + +From Inoue@tpf.co.jp Thu Oct 15 20:31:01 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id UAA26050 + for ; Thu, 15 Oct 1998 20:31:00 -0400 (EDT) +Received: from sd.tpf.co.jp (sd.tpf.co.jp [210.161.239.34]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id UAA12888 for ; Thu, 15 Oct 1998 20:10:03 -0400 (EDT) +Received: from cadzone ([126.0.1.40]) + by sd.tpf.co.jp (2.0 Build 2131 (Berkeley 8.8.4)/8.8.4) with SMTP + id JAA02574; Fri, 16 Oct 1998 09:00:34 +0900 +From: "Hiroshi Inoue" +To: "Jan Wieck" , + "Bruce Momjian" +Subject: RE: [HACKERS] What about LIMIT in SELECT ? +Date: Fri, 16 Oct 1998 09:12:55 +0900 +Message-ID: <000201bdf899$b953bf00$2801007e@cadzone.tpf.co.jp> +MIME-Version: 1.0 +Content-Type: text/plain; + charset="iso-8859-1" +Content-Transfer-Encoding: 7bit +X-Priority: 3 (Normal) +X-MSMail-Priority: Normal +X-Mailer: Microsoft Outlook 8.5, Build 4.71.2173.0 +In-Reply-To: <199810150552.BAA07576@candle.pha.pa.us> +Importance: Normal +X-MimeOLE: Produced By Microsoft MimeOLE V4.72.2106.4 +Status: ROr + +> -----Original Message----- +> From: owner-pgsql-hackers@postgreSQL.org +> [mailto:owner-pgsql-hackers@postgreSQL.org]On Behalf Of Bruce Momjian +> Sent: Thursday, October 15, 1998 2:52 PM +> To: jwieck@debis.com +> Cc: lockhart@alumni.caltech.edu; jwieck@debis.com; eric@linux-hw.com; +> jeff@remapcorp.com; hackers@postgreSQL.org +> Subject: Re: [HACKERS] What about LIMIT in SELECT ? +> +> +> > > I have had more time to think about this. Basically, for pre-sorted +> > > data, our psort code is very fast, because it does not need to sort +> > > anything. It just moves the rows in and out of the sort memory. 
Yes, +> > > it could be removed in some cases, and probably should be, +> but it is not +> > > going to produce great speedups. +> > +> > And I got the time to hack around about this. +> > +> > I hacked in a little check into the planner, that compares +> > the sortClause against the key field list of an index scan +> > and just suppresses the sort node if it exactly matchs and +> > all sort operators are "<". +> > +> > I tested with a 10k row table where key is a text field. The +> > base query is a +> > +> > SELECT ... WHERE key > 'val' ORDER BY key; +> > +> > The used 'val' is always a key that is close to the first of +> > all keys in the table ('' on the first query and the last +> > selected value on subsequent ones). +> +> This is good stuff. I want to think about it for a day. Sounds very +> promising. +> + +Did you see my contribution about this subject ? +I have already implemented above cases and used on trial for three +months or more. +It is good to be formally supported by PostgreSQL community. + +And please remember that there are descending order cases. +(Moreover there are compound cases such as + SELECT * from ... order by key1 desc,key2 asc; + I didn't implement such cases.) + +Thanks. + +Hiroshi Inoue +Inoue@tpf.co.jp + + +From owner-pgsql-hackers@hub.org Fri Oct 16 04:01:07 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id EAA02029 + for ; Fri, 16 Oct 1998 04:01:04 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id DAA05509 for ; Fri, 16 Oct 1998 03:43:53 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id CAA11278; + Fri, 16 Oct 1998 02:00:01 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Fri, 16 Oct 1998 01:57:25 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id BAA11129 + for pgsql-hackers-outgoing; Fri, 16 Oct 1998 01:57:21 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from candle.pha.pa.us (root@s5-03.ppp.op.net [209.152.195.67]) + by hub.org (8.8.8/8.8.8) with ESMTP id BAA11116 + for ; Fri, 16 Oct 1998 01:57:00 -0400 (EDT) + (envelope-from maillist@candle.pha.pa.us) +Received: (from maillist@localhost) + by candle.pha.pa.us (8.9.0/8.9.0) id BAA29942; + Fri, 16 Oct 1998 01:34:33 -0400 (EDT) +From: Bruce Momjian +Message-Id: <199810160534.BAA29942@candle.pha.pa.us> +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +In-Reply-To: from Jan Wieck at "Oct 14, 1998 11: 5: 7 pm" +To: jwieck@debis.com +Date: Fri, 16 Oct 1998 01:34:33 -0400 (EDT) +Cc: lockhart@alumni.caltech.edu, jwieck@debis.com, eric@linux-hw.com, + jeff@remapcorp.com, hackers@postgreSQL.org +X-Mailer: ELM [version 2.4ME+ PL47 (25)] +MIME-Version: 1.0 +Content-Type: text/plain; charset=US-ASCII +Content-Transfer-Encoding: 7bit +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +OK, I have had my day of thinking, and will address this specific +posting first, because it is the most fundamental concerning the future +direction of the optimization. + +> +> And I got the time to hack around about this. 
+> +> I hacked in a little check into the planner, that compares +> the sortClause against the key field list of an index scan +> and just suppresses the sort node if it exactly matchs and +> all sort operators are "<". +> +> I tested with a 10k row table where key is a text field. The +> base query is a +> +> SELECT ... WHERE key > 'val' ORDER BY key; +> +> The used 'val' is always a key that is close to the first of +> all keys in the table ('' on the first query and the last +> selected value on subsequent ones). +> +> Scenario 1 (S1) uses exactly the above query but processes +> only the first 20 rows from the result buffer. Thus the +> frontend receives nearly the whole table. + +OK. + +> +> Scenario 2 (S2) uses a cursor and FETCH 20. But closes the +> cursor and creates a new one for the next selection (only +> with another 'val') as it would occur in a web application. +> +> If there is no index on key, the backend will allways do a +> Sort->SeqScan and due to the 'val' close to the lowest +> existing key nearly all tuples get scanned and put into the +> sort. S1 here runs about 10 seconds and S2 about 6 seconds. +> The speedup in S2 results from the reduced overhead of +> sending not wanted tuples into the frontend. + +Makes sense. All rows are processed, but not sent to client. + +> +> Now with a btree index on key and an unpatched backend. +> Produced plan is always a Sort->IndexScan. S1 needs 16 +> seconds and S2 needs 12 seconds. Again nearly all data is put +> into the sort but this time over the index scan and that is +> slower. + +VACUUM ANALYZE could affect this. Because it had no stats, it thought +index use would be faster, but in fact because 'val' was near the lowest +value, it as selecting 90% of the table, and would have been better with +a sequential scan. pg_statistics's low/hi values for a column could +have told that to the optimizer. + +I know the good part of the posting is coming. + +> Last with the btree index on key and the patched backend. +> This time the plan is a plain IndexScan because the ORDER BY +> clause exactly matches the sort order of the chosen index. +> S1 needs 13 seconds and S2 less than 0.2! This dramatic +> speedup comes from the fact, that this time the index scan is +> the toplevel executor node and the executor run is stopped +> after 20 tuples have been selected. + +OK, seems like in the S1 case, the use of the psort/ORDER BY code on top +of the index was taking and extra 3 seconds, which is 23%. That is a +lot more than I thought for the psort code, and shows we could gain a +lot by removing unneeded sorts from queries that are already using +matching indexes. + +Just for clarity, added to TODO. I think everyone is clear on this one, +and its magnitude is a surprise to me: + + * Prevent psort() usage when query already using index matching ORDER BY + + +> Analysis of the above timings: +> +> If there is an ORDER BY clause, using an index scan is the +> clever way if the indexqual dramatically reduces the the +> amount of data selected and sorted. I think this is the +> normal case (who really selects nearly all rows from a 5M row +> table?). So choosing the index path is correct. This will +> hurt if someone really selects most of the rows and the index +> scan jumps over the disc. But here the programmer should use +> an unqualified query to perform a seqscan and do the +> qualification in the frontend application. 
+ +Fortunately, the optimizer already does the index selection for us, and +guesses pretty well if the index or sequential scan is better. Once we +implement the above removal of psort(), we will have to change the +timings because now you have to compare index scan against sequential +scan AND psort(), because in the index scan situation, you don't need +the psort(), assuming the ORDER BY matches the index exactly. + +> The speedup for the cursor/fetch scenario is so impressive +> that I'll create a post 6.4 patch. I don't want it in 6.4 +> because there is absolutely no query in the whole regression +> test, where it suppresses the sort node. So we have +> absolutely no check that it doesn't break anything. +> +> For a web application, that can use a unique key to select +> the next amount of rows, it will be a big win. + +OK, I think the reason the regression test did not show your code being +used is important. + +First, most of the tables are small in the regression test, so sequential +scans are faster. Second, most queries using indexes are either joins, +which do the entire table, or equality tests, like col = 3, where there +is no matching ORDER BY because all the col values are 3. Again, your +code can't help with these. + +The only regression-type code that would use it would be a 'col > 3' +qualification with a col ORDER BY, and there aren't many of those. + +However, if we think of the actual application you are addressing, it is +a major win. If we are going after only one row of the index, it is +fast. If we are going after the entire table, it is faster to +sequential scan and psort(). You big win is with the partial queries, +where you end up doing a full sequential scan or index scan, then and +ORDER BY, while you really only need a few rows from the query, and if +you deal directly with the index, you can prevent many rows from being +processed. It is the ability to skip processing those extra rows that +makes it a big win, not so much the removal of the ORDER BY, though that +helps too. + +Your solution really is tailored for this 'partial' query application, +and I think it is a big need for certain applications that can't use +cursors, like web apps. Most other apps have long-time connections to +the database, and are better off with cursors. + +I did profiling to improve startup time, because the database +requirements of web apps are different from normal db apps, and we have +to adjust to that. + +So, to reiterate, full queries are not benefited as much from the new +code, because sequential scan/psort is faster, or because the index only +retrieves a small number of rows because the qualification of values is +very specific. + +Those open-ended, give me the rows from 100 to 199 really need your +modifications. + +OK, we have QUERY_LIMIT, and that allows us to throw any query at the +system, and it will return that many of the first rows for the ORDER BY. +No fancy stuff required. If we can get a matching index, we may be able +to remove the requirement of scanning all the row (with Jan's patch), +and that is a big win. If not, we at least prevent the rows from being +returned to the client. + +However, there is the OFFSET issue. This is really a case where the +user wants to _restart_ the query where they left off. That is a +different problem. All of a sudden, we need to evaluate more of the +query, and return a segment from the middle of the result set. + +I think we need to decide how to handle such a restart. 
Do we +re-evaluate the entire query, skipping all the rows up to OFFSET, and +return the number of rows they requested after OFFSET. I would think we +don't want to do that, do we. It would be much easier to code. If it +is a single table, skipping forward has to be done anyway, because we +can't just _jump_ to the 100th entry in the index, unless we pass some +_tid_ to the user, and expect them to pass that back to start the query. +I don't think we went to do that. It is ugly, and the row may have +moved since we started. So, for a single table, adding a QUERY_OFFSET +would do exactly what we need, with Jan's patches. + +For a joined query, I think you will have to do the entire _join_ before +returning anything. + +You can't just process all the joins up to the OFFSET location, and you +can't just jump to the 100th index location, because you don't know that +the 100th index location produced the 100th result just returned to the +user. You have to process the whole query, and because of the join and +not knowing which data row from each table is going to make which entry +in the final result. If you are really craft, and the ORDER BY table is +in the outer part of the join loop, you could start processing the table +that is part of the outer loop in _index_ order, because you know that +the rows processed in index order are going to produce the output in +result order. You then could process and throw away the results up to +offset, and generate the needed rows and stop. + +The other way of doing it is to specify a query limit based on specific +index entries, so you say I want the query returned by the first 20 +index entries matching the ORDER BY, or entries 100-199, and the query +is limited to using only those entries in the index. In that case, +though, in joins, you could return more or less rows in the result +depending on the other tables, and that may be unacceptable. However, +for this case, the advantage is that you don't need to process the rows +from 1 to 99 because you have been told the user only wants rows from +certain index slots. If the user requests rows 50000-50100, this would +be much faster because you don't have to process the 50000 rows before +returning any data. However, I question how often people grab stuff +from the center of large data sets. Seems the QUERY_OFFSET idea may be +easier for users. + +I will be commenting on the rest of the optimization postings tomorrow. + +-- + Bruce Momjian | http://www.op.net/~candle + maillist@candle.pha.pa.us | (610) 853-3000 + + If your life is a hard drive, | 830 Blythe Avenue + + Christ can be your backup. | Drexel Hill, Pennsylvania 19026 + + +From Inoue@tpf.co.jp Fri Oct 16 03:31:02 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id DAA01767 + for ; Fri, 16 Oct 1998 03:31:00 -0400 (EDT) +Received: from sd.tpf.co.jp (sd.tpf.co.jp [210.161.239.34]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id DAA04551 for ; Fri, 16 Oct 1998 03:13:40 -0400 (EDT) +Received: from cadzone ([126.0.1.40]) + by sd.tpf.co.jp (2.0 Build 2131 (Berkeley 8.8.4)/8.8.4) with SMTP + id QAA02680; Fri, 16 Oct 1998 16:04:09 +0900 +From: "Hiroshi Inoue" +To: "Bruce Momjian" +Cc: +Subject: RE: [HACKERS] What about LIMIT in SELECT ? 
+Date: Fri, 16 Oct 1998 16:16:29 +0900 +Message-ID: <000001bdf8d4$e4cdf520$2801007e@cadzone.tpf.co.jp> +MIME-Version: 1.0 +Content-Type: text/plain; + charset="iso-8859-1" +Content-Transfer-Encoding: 7bit +X-Priority: 3 (Normal) +X-MSMail-Priority: Normal +X-Mailer: Microsoft Outlook 8.5, Build 4.71.2173.0 +Importance: Normal +X-MimeOLE: Produced By Microsoft MimeOLE V4.72.2106.4 +In-Reply-To: <199810160621.CAA01030@candle.pha.pa.us> +Status: RO + +Where's my contibution to hackers@potsgreSQL.org ? +I will resend it. + +> -----Original Message----- +> From: Bruce Momjian [mailto:maillist@candle.pha.pa.us] +> Sent: Friday, October 16, 1998 3:22 PM +> To: Hiroshi Inoue +> Cc: jwieck@debis.com +> Subject: Re: [HACKERS] What about LIMIT in SELECT ? +> +> +> [Charset iso-8859-1 unsupported, filtering to ASCII...] +> > > > The used 'val' is always a key that is close to the first of +> > > > all keys in the table ('' on the first query and the last +> > > > selected value on subsequent ones). +> > > +> > > This is good stuff. I want to think about it for a day. Sounds very +> > > promising. +> > > +> > +> > Did you see my contribution about this subject ? +> +> I am sorry. I have not seen it, and I am confused how I could have +> missed it. +> +> > I have already implemented above cases and used on trial for three +> > months or more. +> > It is good to be formally supported by PostgreSQL community. +> > +> > And please remember that there are descending order cases. +> > (Moreover there are compound cases such as +> > SELECT * from ... order by key1 desc,key2 asc; +> > I didn't implement such cases.) +> +> Where is the discussion of this? I am confused. You have been using +> code for three months that does this? +> + +Hi all. +I didn't follow all the posts about this thread. +So this post may be out of center. + +I think current PostgreSQL lacks the concern to the response to get first +rows quickly. +For example,queries with ORDER BY clause necessarily include sort steps +and process all target rows to get first rows only. +So I modified my code for ORDER BY cases and use on trial. +I don't understand PostgreSQL sources,so my code is not complete. + +I modified my code for the following 2 cases. + +1.In many cases the following query uses index scan. + SELECT * from ... where key > ...; (where (key) is an index) + If so,we can omit sort steps from the access plan for the following + query. + SELECT * from ... where key > ... order by key; + + Currently cursors without sort steps may be sensitive diffrent from + cursors with sort steps. But no one mind it. + +2.In many cases the following query uses index scan same as case 1. + SELECT * from ... where key < ...;(where (key) is an index) + If so and if we scan the index backward,we can omit sort steps from + the access plan for the following query. + SELECT * from ... where key < ... order by key desc; + + To achive this(backward scan),I used hidden(provided for the future ?)code + that is never executed and is not necessarily correct. + +In the following cases I didn't modify my code to use index scan, +because I couldn't formulate how to tell PostgreSQL optimizer whether +the response to get first rows is needed or the throughput to process +sufficiently many target rows is needed. + +3.The access plan made by current PostgreSQL optimizer for a query with + ORDER BY clause doesn't include index scan. + +I thought the use of Tatsuo's QUERY_LIMIT to decide that the responce +is needed. It is sufficient but not necessary ? 
+In Oracle the hints FIRST_ROWS,ALL_ROWS are used. + +Thanks. + +Hiroshi Inoue +Inoue@tpf.co.jp + + +From wieck@sapserv.debis.de Fri Oct 16 05:01:03 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id FAA02500 + for ; Fri, 16 Oct 1998 05:01:02 -0400 (EDT) +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id EAA06270 for ; Fri, 16 Oct 1998 04:13:59 -0400 (EDT) +Received: by dsh.de; id KAA11635; Fri, 16 Oct 1998 10:12:45 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma011343; Fri, 16 Oct 98 10:12:15 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id KAA21793; + Fri, 16 Oct 1998 10:09:49 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id KAA01799; + Fri, 16 Oct 1998 10:12:11 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zU2aB-000B5AC; Fri, 16 Oct 98 07:37 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for maillist@candle.pha.pa.us + id m0zU57w-000EBQC; Fri, 16 Oct 98 10:20 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +To: Inoue@tpf.co.jp (Hiroshi Inoue) +Date: Fri, 16 Oct 1998 10:20:47 +0200 (MET DST) +Cc: maillist@candle.pha.pa.us, jwieck@debis.com +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <000001bdf8d4$e4cdf520$2801007e@cadzone.tpf.co.jp> from "Hiroshi Inoue" at Oct 16, 98 04:16:29 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Status: RO + +Hiroshi Inoue wrote: + +> In the following cases I didn't modify my code to use index scan, +> because I couldn't formulate how to tell PostgreSQL optimizer whether +> the response to get first rows is needed or the throughput to process +> sufficiently many target rows is needed. +> +> 3.The access plan made by current PostgreSQL optimizer for a query with +> ORDER BY clause doesn't include index scan. +> +> I thought the use of Tatsuo's QUERY_LIMIT to decide that the responce +> is needed. It is sufficient but not necessary ? +> In Oracle the hints FIRST_ROWS,ALL_ROWS are used. + + I still think that the QUERY LIMIT should be part of the + parse tree and not thrown in by a magic SET command. If + rewriting or function calls turn the one query sent to the + backend into multiple queries processed internal, how should + this QUERY LIMIT variable know to which of all the queries it + has to be applied? It can really break functions and rewrite + rules if this variable is used on all queries while it is + set. + + For your case 3 I think, if there is a QUERY LIMIT in the + parse tree, the (future) optimizer definitely knows that not + all rows will get processed even if there is no qualification + given. So if there is an index, that matches the ORDER BY + clause and it is no a join and the (future) executor handles + OFFSET in single table index scans fast, it could choose an + index scan for this query too. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. 
# +#======================================== jwieck@debis.com (Jan Wieck) # + + + +From owner-pgsql-hackers@hub.org Fri Oct 16 12:02:27 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id MAA13063 + for ; Fri, 16 Oct 1998 12:02:23 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id MAA18435 for ; Fri, 16 Oct 1998 12:01:46 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id LAA24469; + Fri, 16 Oct 1998 11:28:54 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Fri, 16 Oct 1998 11:25:54 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id LAA24370 + for pgsql-hackers-outgoing; Fri, 16 Oct 1998 11:25:52 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by hub.org (8.8.8/8.8.8) with ESMTP id LAA24356 + for ; Fri, 16 Oct 1998 11:25:34 -0400 (EDT) + (envelope-from wieck@sapserv.debis.de) +Received: by dsh.de; id RAA06506; Fri, 16 Oct 1998 17:28:04 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma006149; Fri, 16 Oct 98 17:27:12 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id RAA00811 + for ; Fri, 16 Oct 1998 17:24:37 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id RAA04532 + for ; Fri, 16 Oct 1998 17:26:54 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zU9N0-000B5AC; Fri, 16 Oct 98 14:52 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for pgsql-hackers@postgreSQL.org + id m0zUBum-000EBQC; Fri, 16 Oct 98 17:35 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: [HACKERS] SELECT ... LIMIT (trial implementation) +To: pgsql-hackers@postgreSQL.org (PostgreSQL HACKERS) +Date: Fri, 16 Oct 1998 17:35:39 +0200 (MET DST) +Reply-To: jwieck@debis.com (Jan Wieck) +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +Here we go, + + this is up to now only for discussion, do not apply to CVS! + + Those involved into the LIMIT discussion please comment. + + Here is what I had in mind for the SELECT ... LIMIT. It adds + + SELECT ... [LIMIT count [, offset]] + + to the parser and arranges that these values are passed down + to the executor. + + It is a clean implementation of LIMIT (regression tested) and + the open items on it are to enable parameters and handle it + in SQL functions and SPI stuff (currently ignored in both). + Optimizing the executor would require the other sort node + stuff discussion first to come to a conclusion. For now it + skips final result rows - but that's already one step forward + since it reduces the rows sent to the frontend to exactly + that what LIMIT requested. + + I've seen the queryLimit by SET variable stuff and that + really can break rewrite rules, triggers or functions. This + is because the query limit will be inherited by any query + (inserts, updates, deletes too) done by them. 
Have a rule for + constraint deletes of referencing tuples + + CREATE RULE del_table1 AS ON DELETE TO table1 DO + DELETE FROM table2 WHERE ref = OLD.key; + + If the user now sets the query limit to 1 via SET and deletes + a row from table1, only the first found record in table2 will + be constraint deleted, not all of them. + + This is a feature where users can get around rules that + ensure data integrity. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. # +#======================================== jwieck@debis.com (Jan Wieck) # + + +begin 644 opt_limit.diff.gz +M'XL(`$]=)S8"`^4\:W?B1K*?R:_H82=98&1;$F^<>`^+F1DV&!S`NXLSV5RO+6X3RX=1&F#W57U4JE6/0%-X&#AG;$3$:Q#`Z]6JG +MVB)&N]WZZN3DY$`?J;9FI][HU&J\;27]88.IMC6H9J^(&E_K!)Y.OB+D;XYG +MNYL%)<5-Y+CAV%S-JUGX066YV7?@4VI9]3['VC5(;1H'CW?$V\.^L +MPKI5/U!**H7"-4/>_TCM3>0'[ZFUOJ(K/W@B8;19+C.I,G1#,_1&0I>AFU#0 +MDI05'"\J%`I+:D6;@)YCR0\;&CQ=TM"&/G^1SZR&]];SO8A^C(C-?\^!E@*4 +MA8#'=5:VO_&BL$YZ`9!%B85##2/B^3#;R\!?D>B>DCOG@7J$#9<\6.Z&\D8YN,[P9T57 +M(8U*WT@R-:)K)'3^0_UE29:5RVQ"Y.MI]+2FA<)W9#9GR7@:4)EU`4[87 +MVK"+V\DN_NL06!KA+JN41S?#8?(6;YG?1OHB5P%1,5SV<&4YWI8&RJC/5D$9 +M@,?KH$.-&QU#[]1;^4JH4=,:[43DP6M3EXLXMKU_)*^$4J +M:_C62)\M":E0!*1LHD63V6;MTIEUZ]*IZT>DPM>6,@SI5K#`"4:V"%P=0(D5 +MA'0&"Z*1WFHQ`YE%_#4-`+\/D&\0$D03\9=+$!"LPU`@P&)OL[JE@2@F4]OR +M+IV`VMB8+.23@&<$EBIK4(K1 +M+5M<-$#5R'E'5;LC;#[WP[`@%V\/#;:6NK,275M1-MGVFN`,QAL,7_A6`O2? +M<,K$>A0*\8(D>KR``XG73@@/8"LCX:N_-KE<@(#XX)RE%'SY)&2K##"4FJJA +MU`01VVS'(G:;M("&&Y=9.F#],(OS$L49RC16B.Q7*)0J6.!X;$3(@4QG,J.+ +M$P026D_*N#G"BJ1QL_,YWH)J`TEME:1V33/K"4E\PD\N:#A?![Y-0;6`?.8NPLV`#?9`^06U\@X;F-)76\(-A?LK?AX&HP([9K;4)A +M>N4@8J87JHN8"<@KT!F@7\I8\XD)'V&`5FP.P684QL9^KJW`6D'=6JUC:-E3 +M^.A$]CTIH7DXL^Z2;LH,O^B@`'8-E59J>O?E?J3R7BBD>(::VA$`V*%!#IVI/P% +M4S=TLZ-("3L[L8>/1CVN0SH""6<]HJY*O%]O]%362 +M,7N\OV?9$2ZA&-ZW1.=SF^XM10+H"*3BEA*/WL'V>:""@N?8J#R:\_BV^BV, +MQS'LX;O$X-YA.Z5J+]<)_^$@TTDAH?*\0JP':@2S86IF(]$(+S)4 +MH:G5(NXNO]#`L^;"^C:;K1>@[,TN9>0%*@IF/.KS_?+2:],`JZJA6E7- +M>DMK-FH*,8=]*%[\6QTI7OP%WE0F8(HN>5+N"Y, +M;\V4",Y1T*BA81U`A#^21]3-J#.]OW`6)8"DBGRQ\`O+?@1FC%.:? +MN.!)NJ%L!KC7?A@Z8.Z3TF*#`5J`7="/)(3I+DNP$_(ORQ,O><2>;5D_?/'( +MA51+0M6?G*C50G=BC-7Q-CPP&T<\,V?VB-@(KC^R1I@7'%$`#D1'%,@O"(_D +MM:YV=*.CF_GQ$5/70;4VI&K%R1`J&`/NT);L%FP?10AR\1+$GEMV$ +M'\#P)O$9238H.XJ1E(KCF#RDZ#$?A90?T&R!RH`\DQ\L(H\B@D?DF2)>!Q1] +M/[8O1'1><"!C>SSK7()?Q_D1A-;-#%AR.NO.IMD,7@,&KQN)AZ2P\_C[>>]F +M,AU/$D;./XX0`H_)R;?HUD7I4X8Y8K0W04#1`U:#7!FAKW,Y#?$>Q:GH75W. +MI_UAOSCZMA#E3^,W'PU@5(#S^#7 +M3ZCDMRV-G=IL9;$#MJLO:GJVOMC?%/2,V:G6\U6&46^V-*/>4CQ)6:0LK$J>N[['ZZFW-;"C) +M&;P@<>ZM]1IZF;(\BH&W]$&0!1J/K9U*P^PP`E_+7(3*JA,194V!?8Q.K;IG3[/C/363!`N:B<<>^=`- +M[%\W7,-CB4O9;\`98PP+()#+F$#%>6QR5' +M=Z6R"7)*$I:'2&-6"@D.+/`.7_Z#]WB +MJ.W*;(;:ALK@)R.;G_:VK()+T3&;>]BII6M&2SU6PX*VKKI[,I0^I2ZUHVFT +MBCHBA@/Z]E6IE)2#'D^B7&50VBRC`O->1-!:!%V2J$L46%ZX](-5@J2T%B9# +M+F(9B^=8MAEH:T#J>!3`3!R2-0ZAZ"D'%,^<)`KR-Y>XWB8(_>!8XM08U$&F +MNPNLU>E3%D?PFKWLQD$R>,WH=L[HW(Q&,Y+KB!.*KL9N1:%O`+_[= +M&?PBUV2G'N+)QW:-9:,=>/;1LJ/MC$2)(D1S*MPJ8@CHQW4@VEA1%``@<.,\LH([&HGR +MK0*3GVW`5"A]P1#CT"NY"_S-.A[5#/CHC@9O07EU@[N0S^77S#?]%N7E10'U +MVIS+T:0*HQ/4\BX*L/?FT(62O&$DMNC_RRGDK;%-R&3"G,F8+Y[8[%,\3+%K +M;F5/Z(8BY@4Z!]QTZ(E&]OW\WG^+@C'I'/ZR%5^RQ)??T:C5<14?/D#/#JO.G'_(ZF +MKL8CF@;HAJ8:H?^5=*<]>:Q<^$18/\5OB^?D6=1?]C,`+A2`LTK_ZGKV<^4L +M"\M91>315,X(GR?A]VVS8Z?`ST5WU0_YN_9WDJF59$\N#SJ4Y"\L."Q1K0Q? 
+M9OE<]/LKR>O@$"869=["!5XIHUJXHJ(Y@)^GR=SMK5,8L`BW=)OX2O/CF/3& +M8V4QFWDB-2KC'H92FW,)0X'8NH'QVM@&R+A[H=3N7+Q(M=R^=9&JW+YRD<77 +MPO'BD\PRO$9^>J9R,KW`&1.Y5]SBVYOK)?>-N,ATV!?[0)\>_6`19D<`DMJ] +M/ED"]ME^64[3.K3KF'M"D@9L>,-,3C`^%9E]740]?-G_Z5F3A:!OG"ADY>]! +M_\VF295'@V*LWZ'XA!?#L**BU/`)=$B]T&&I:2E30`&(J+7@E3-0]$D%K"*P +M5C$Q%I0JGQ>/GS5QQXJ3)<782Y!U`@ +M+$!S%F.K26)WZ3(J^ +MYQRF:IA:U5`L'%9@*AEP]]1"J\T/:8D^@"0"+\]-L@5%FHQ(P1$:RO'(@T,? +MB<.%6+A98[2-+G82==BQ5]JZ$*F_Y-=?B5K;V\D+WA*D:K9QNEX)GNE:+\X24R-;V+6$9VA(G(1B](A6>"3&6YZ*8:,QR\N(" +M,!5@LEZEKB1=\X/5^(Y+YFT7R9S9]WP_^]X-BPLR2G?C@G\RI8>O[KSL5&2G +MBX%3:!B_Q\4N]>3LU9%K+^\J95Y*PENG55F] +MG0R2/XSZS)5O5K5JLYF$+YD*FR/])7['E3%3%)_Y,'>#!0Z<4]]6O?N`=S5P/4M8^[8 +MQF9'KW6,/:>EK:K64F_6`@]$/KK:(<\MC%.!T1Q2+F=+*44_@IWO\8NE3!I) +M+I]&5A`==:.4Z2\5S\Y5Z!>XC'N>C#6^H(R(8?L<.\@8`=[*9P-E6%BT(H!9 +MB<(25]K@&[@TT$B;_N3G.3-%>:8:6Y1$I_S?6I1C +M[([_BJ7*%PX\(X8)%O:X)1MVJ[-%PRY!W""-MZ"!EQ5!O3!\3>A>J6G +MPATVX=\A+S"T:F(#HOT;W@]-7S")?)XAD]62YS,4\EN*K!BV7X1F93$+<.>P +M&G6-!Z/F.C4F]M8**7JR/(T0&V$)D;XMP>*\OWK2J+:UAOHWC!HU`PJ2-+DX +MBSU),N$4*']QI62SQ(LRCM'F">%\B#Q7_A;#IG*<&6X2_LHNFC+*R0/./ +J5".V8K`1>&?LC3PZT;V_B8@X[<#CAC]SA9Y); Sun, 18 Oct 1998 14:01:48 -0400 (EDT) +Received: from ra.sai.msu.su (ra.sai.msu.su [158.250.29.2]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id NAA23532 for ; Sun, 18 Oct 1998 13:51:14 -0400 (EDT) +Received: from ra (ra [158.250.29.2]) + by ra.sai.msu.su (8.9.1/8.9.1) with SMTP id UAA17832; + Sun, 18 Oct 1998 20:45:25 +0300 (MSK) +Date: Sun, 18 Oct 1998 21:45:24 +0400 (MSD) +From: Oleg Bartunov +X-Sender: megera@ra +To: Tom Lane +cc: Bruce Momjian , pgsql-hackers@postgreSQL.org, + jwieck@debis.com +Subject: Re: [HACKERS] SELECT ... LIMIT (trial implementation) +In-Reply-To: <2292.908726689@sss.pgh.pa.us> +Message-ID: +Organization: Sternberg Astronomical Institute (Moscow University) +MIME-Version: 1.0 +Content-Type: TEXT/PLAIN; charset=US-ASCII +Status: RO + +On Sun, 18 Oct 1998, Tom Lane wrote: + +> Date: Sun, 18 Oct 1998 12:04:49 -0400 +> From: Tom Lane +> To: Bruce Momjian +> Cc: pgsql-hackers@postgreSQL.org +> Subject: Re: [HACKERS] SELECT ... LIMIT (trial implementation) +> +> Bruce Momjian writes: +> > What if someone wants the rows from 500 to the end. Should we allow +> > the syntax to be: +> > SELECT ... [LIMIT count] [OFFSET offset] +> > LIMIT and OFFSET are independent. +> +> I like that syntax the best, but remember we are not inventing in +> a green field here. Isn't this a feature that already exists in +> other DBMs? We should probably copy their syntax, unless it's +> truly spectacularly awful... +> +> regards, tom lane +> + +Mysql uses LIMIT [offset,] rows +>From documentation: + + LIMIT takes one or two numeric arguments. A single argument + represents the maximum number of rows to return in a result. If two + arguments are given the first argument is the offset to the first row to + return, while the second is the maximum number of rows to return in the + result. + +What would be nice if somehow total number of rows could be returned. +This is often needed for altavista-like application. +Of course, I can do +select count(*) from sometable ... LIMIT offset, rows +and then +select ... from sometable ... LIMIT offset, rows +but this seems not elegant solution. 
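+
+Spelled out, the two-statement workaround described above would look
+roughly like this (MySQL-style LIMIT offset, rows as quoted from the
+MySQL documentation; the offset 20 and row count 10 are only
+illustrative, and the count query is left unlimited because, as Jan
+notes in a later message, aggregates scan the whole result anyway):
+
+select count(*) from sometable where ...;        -- total number of rows
+select * from sometable where ... limit 20, 10;  -- skip 20 rows, return at most 10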
+ + Regards, + + Oleg +_____________________________________________________________ +Oleg Bartunov, sci.researcher, hostmaster of AstroNet, +Sternberg Astronomical Institute, Moscow University (Russia) +Internet: oleg@sai.msu.su, http://www.sai.msu.su/~megera/ +phone: +007(095)939-16-83, +007(095)939-23-83 + + +From owner-pgsql-hackers@hub.org Sun Oct 18 14:31:12 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id OAA02288 + for ; Sun, 18 Oct 1998 14:31:10 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id OAA24844 for ; Sun, 18 Oct 1998 14:15:35 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id OAA26655; + Sun, 18 Oct 1998 14:00:03 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Sun, 18 Oct 1998 13:58:57 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id NAA26381 + for pgsql-hackers-outgoing; Sun, 18 Oct 1998 13:58:55 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from ra.sai.msu.su (ra.sai.msu.su [158.250.29.2]) + by hub.org (8.8.8/8.8.8) with ESMTP id NAA26367 + for ; Sun, 18 Oct 1998 13:58:49 -0400 (EDT) + (envelope-from oleg@sai.msu.su) +Received: from ra (ra [158.250.29.2]) + by ra.sai.msu.su (8.9.1/8.9.1) with SMTP id UAA18077; + Sun, 18 Oct 1998 20:58:41 +0300 (MSK) +Date: Sun, 18 Oct 1998 21:58:41 +0400 (MSD) +From: Oleg Bartunov +X-Sender: megera@ra +To: Jan Wieck +cc: PostgreSQL HACKERS +Subject: Re: [HACKERS] SELECT ... 
LIMIT (trial implementation) +In-Reply-To: +Message-ID: +Organization: Sternberg Astronomical Institute (Moscow University) +MIME-Version: 1.0 +Content-Type: TEXT/PLAIN; charset=US-ASCII +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +Jan, + +I tested your patch on my Linux box and it works ok, except +aggregates functions doesn't work properly, for example +count(*) always produces 0 + +kdo=> select count(*) from work_flats limit 10,1000; +count +----- +(0 rows) + +while + +kdo=> select rooms from work_flats limit 10,1000; +rooms +----- + 3 + 3 + 3 + 3 + 3 + 3 + 3 + 3 + 3 + 3 +(10 rows) + + + Regards, + + Oleg +_____________________________________________________________ +Oleg Bartunov, sci.researcher, hostmaster of AstroNet, +Sternberg Astronomical Institute, Moscow University (Russia) +Internet: oleg@sai.msu.su, http://www.sai.msu.su/~megera/ +phone: +007(095)939-16-83, +007(095)939-23-83 + + + +From wieck@sapserv.debis.de Sun Oct 18 15:17:53 1998 +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id PAA03203 + for ; Sun, 18 Oct 1998 15:17:49 -0400 (EDT) +Received: by dsh.de; id VAA01180; Sun, 18 Oct 1998 21:19:50 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma001117; Sun, 18 Oct 98 21:19:33 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id VAA25465; + Sun, 18 Oct 1998 21:17:29 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id VAA14993; + Sun, 18 Oct 1998 21:19:58 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zUvyS-000B5AC; Sun, 18 Oct 98 18:46 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for maillist@candle.pha.pa.us + id m0zUyWO-000EBPC; Sun, 18 Oct 98 21:29 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] SELECT ... LIMIT (trial implementation) +To: oleg@sai.msu.su (Oleg Bartunov) +Date: Sun, 18 Oct 1998 21:29:43 +0200 (MET DST) +Cc: tgl@sss.pgh.pa.us, maillist@candle.pha.pa.us, pgsql-hackers@postgreSQL.org, + jwieck@debis.com +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: from "Oleg Bartunov" at Oct 18, 98 09:45:24 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Status: RO + +Oleg Bartunov wrote: + +> On Sun, 18 Oct 1998, Tom Lane wrote: +> +> > Bruce Momjian writes: +> > > What if someone wants the rows from 500 to the end. Should we allow +> > > the syntax to be: +> > > SELECT ... [LIMIT count] [OFFSET offset] +> > > LIMIT and OFFSET are independent. +> > +> > I like that syntax the best, but remember we are not inventing in +> > a green field here. Isn't this a feature that already exists in +> > other DBMs? We should probably copy their syntax, unless it's +> > truly spectacularly awful... +> > +> > regards, tom lane +> > +> +> Mysql uses LIMIT [offset,] rows +> >From documentation: +> +> LIMIT takes one or two numeric arguments. A single argument +> represents the maximum number of rows to return in a result. If two +> arguments are given the first argument is the offset to the first row to +> return, while the second is the maximum number of rows to return in the +> result. + + Simple change, just flip them in gram.y. + + And for the 500 to end: + + SELECT ... LIMIT 500, 0 (after flipped) + + The 0 has the same meaning as ALL. 
And that could also be + added to the parser easily so one can say + + SELECT ... LIMIT 500, ALL + + too. + +> +> What would be nice if somehow total number of rows could be returned. +> This is often needed for altavista-like application. +> Of course, I can do +> select count(*) from sometable ... LIMIT offset, rows +> and then +> select ... from sometable ... LIMIT offset, rows +> but this seems not elegant solution. + + Absolutely makes no sense for me. As said in the other + posting, aggregates do the counting scan in a deeper level + and thus cannot get limited. So if you invoke an aggregate, + the whole scan is always done. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. # +#======================================== jwieck@debis.com (Jan Wieck) # + + + +From owner-pgsql-hackers@hub.org Sun Oct 18 19:08:47 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id TAA00573 + for ; Sun, 18 Oct 1998 19:08:46 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id QAA01305 for ; Sun, 18 Oct 1998 16:14:30 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id PAA06110; + Sun, 18 Oct 1998 15:55:20 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Sun, 18 Oct 1998 15:54:07 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id PAA05771 + for pgsql-hackers-outgoing; Sun, 18 Oct 1998 15:54:05 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by hub.org (8.8.8/8.8.8) with ESMTP id PAA05753 + for ; Sun, 18 Oct 1998 15:53:52 -0400 (EDT) + (envelope-from wieck@sapserv.debis.de) +Received: by dsh.de; id VAA09240; Sun, 18 Oct 1998 21:56:10 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma008902; Sun, 18 Oct 98 21:55:19 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id VAA28158; + Sun, 18 Oct 1998 21:53:16 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id VAA15349; + Sun, 18 Oct 1998 21:55:45 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zUwX6-000B5AC; Sun, 18 Oct 98 19:22 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for hackers@postgreSQL.org + id m0zUz52-000EBPC; Sun, 18 Oct 98 22:05 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] SELECT ... LIMIT (trial implementation) +To: terry@terrym.com (Terry Mackintosh) +Date: Sun, 18 Oct 1998 22:05:31 +0200 (MET DST) +Cc: hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: from "Terry Mackintosh" at Oct 18, 98 03:58:57 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +> +> On Sun, 18 Oct 1998, Tom Lane wrote: +> +> > Bruce Momjian writes: +> > > What if someone wants the rows from 500 to the end. 
Should we allow +> > > the syntax to be: +> > > SELECT ... [LIMIT count] [OFFSET offset] +> > > LIMIT and OFFSET are independent. +> > +> > I like that syntax the best, but remember we are not inventing in +> > a green field here. Isn't this a feature that already exists in +> > other DBMs? We should probably copy their syntax, unless it's +> > truly spectacularly awful... +> > +> > regards, tom lane +> +> None that I have used (VFP, M$ SQL Server) that had 'LIMIT', had 'OFFSET'. +> So it would seem that the very idea of OFFSET is to break with what others +> are doing. +> +> I too like the above syntax. +> Why mimic, when you can do better? Go for it! +> + + We have a powerful parser. So we can provide + + ... [ LIMIT { rows | ALL } ] [ OFFSET skip ] + + or + + ... [ LIMIT [ skip , ] { rows | ALL } ] + + at the same time. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. # +#======================================== jwieck@debis.com (Jan Wieck) # + + + + +From owner-pgsql-hackers@hub.org Sun Oct 18 19:08:39 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id TAA00557 + for ; Sun, 18 Oct 1998 19:08:37 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id QAA03555 for ; Sun, 18 Oct 1998 16:56:03 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id QAA10374; + Sun, 18 Oct 1998 16:36:26 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Sun, 18 Oct 1998 16:35:16 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id QAA10298 + for pgsql-hackers-outgoing; Sun, 18 Oct 1998 16:35:15 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by hub.org (8.8.8/8.8.8) with ESMTP id QAA09974 + for ; Sun, 18 Oct 1998 16:32:21 -0400 (EDT) + (envelope-from wieck@sapserv.debis.de) +Received: by dsh.de; id WAA18249; Sun, 18 Oct 1998 22:34:46 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma018115; Sun, 18 Oct 98 22:34:11 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id WAA29950; + Sun, 18 Oct 1998 22:32:01 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id WAA15581; + Sun, 18 Oct 1998 22:34:28 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zUx8Z-000B5AC; Sun, 18 Oct 98 20:01 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for jwieck@debis.com + id m0zUzgV-000EBPC; Sun, 18 Oct 98 22:44 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] SELECT ... 
LIMIT (trial implementation) +To: jwieck@debis.com +Date: Sun, 18 Oct 1998 22:44:15 +0200 (MET DST) +Cc: terry@terrym.com, hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: from "Jan Wieck" at Oct 18, 98 10:05:31 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +> We have a powerful parser. So we can provide +> [...] + + This version now accepts all of the following + + ... [ LIMIT rows ] [ OFFSET skip ] + ... [ OFFSET skip ] [ LIMIT rows ] + ... [ LIMIT [ skip , ] rows ] + + rows can be a positive integer constant greater that 0, a $n + parameter (in SPI_prepare()) or the keyword ALL. 0 isn't + accepted as constant to force ALL in that case making clear + that this is wanted. In the parameter version the integer + value 0 still is used to mean ALL. + + skip can be a positive integer constant greater or equal to 0 + or a $n parameter for SPI_prepare. + + If any of these syntaxes is used in SPI_prepare()'d plans, + the given tcount argument for SPI_execp() is ignored and the + plan or parameter values are used. + + Anyone happy now? + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. # +#======================================== jwieck@debis.com (Jan Wieck) # + +begin 644 opt_limit.diff.gz +M'XL(")%0*C8"`V]P=%]L:6UI="YD:69F`.4\:W?;-K*?U5^!:-.NY-"V*-EZ +MN?$>5:83;67)U2-MSSWWZ-`29/-&(E62BN--_=_OS``@08F4E4>;[JY/&Y'` +M8(`9#&8&@P%GSGS.#J<^"_SID><[M\'A$WTDVI:;I]7FR8EH>Y#\ +MH\%4&@94TRNBQM=3!D^'WS#V-\>=+M8SSO+KT%D$Q\OI^_#H+I]2L_+\T%ZD +MUP4/P=2>WG&L?:'5!J'ON+>B#?QW?$#=ZG]0R@YRN6M";KWGTW7H^:^YO;KB +M2\]_8$&XGL]3J3)+IF&6JC%=9JD,!75%6,?9K?..NXR&R][9BS47C3)P'>//DB\#'A:^4V0:K&2PP/D7]^8%558L$D/4 +MZU'XL.*YW$LVFA"/DI4T+H)XR3J]T4F_)``0)AL.;CE +M(P`7.;"`7XEA+J(T!+\%0]CR((8 +M4)$D^QG+*5R#M5N(%HG!HI;#$"1/-3*87%^&D+>B6A_.G!5F'$3SY4O6\UQ> +MA*4(O+KJO[&0-!S+:NYS'GO +MM9*:Y``X[DS9.\^9,T'?`038K#V0+Q`2 +M5!/SYG-0$-1A(!%@L;M>WG!?%K/AU'8O')]/L3&;J2<)3P06#E9@%,/YVIT6 +M6:&H4R,8`"0,.!A._HX7-ND+X%^#I>$QTM5%%4R-XCN:VBUE\[%_A`6E>'-H +ML+3TE17;V@-MD6W."7(P6F#X(I82H/^`+)/SDI!BJVUHA4["9I/@_6"_)TP/LAC_," +MU1GJ-"I$\!]+URD!$KA>:BT@(V#.P&6!?BECS@92/ +M=$`/I@*".)J[MGU["86KK<*N$X0==^[E5O36$54T!8YR6G+!O1-.[U@!_<:1 +M?1OW7Z2.9<\Y<'BX*"O0& +M'-FWT;A4CT2)ZG$5HZ/RM!XEL0"CSSP63A;`D(D#'(E[(9Y'332.XQ]?>+<% +M:S#H#PR6)R`>F3*,I,-Y/B(,X0L.R1R37?V/\[]' +M;QUWAI-\W1JTKB:=WIM6%SQCYKQXH0;P00U$&V34\J5JV1M?L>^^TS$[5+M2 +MO*8:9Q:1%7,:7QZW&;'=AQS=E^;,1I_";T_O)($63"MBON$T7V)G$V/=D#FM +M@UC@TF5NQNU0= +M(=URE-^SDB1X'W)=?@M*]ITB]C':>NRMGX3R_1SU)##LT$[QMBQ63IMEGZJ; +MY/;S2=6D;(RNF;0QQ$9(ZB5Z_62U%"'3M%*BMS]3*8G!_$5UDN3SUU!)>_'E +MHS62P+I;(25E[<_21]1K4AWIP9ND-A)#W*F,-BE-TT7@4JE%*YU;VHNC\YKA +MO36,OZ][<\2^[($ +M@.[$GHD*.1_H5Y8;9:/_K12+H]H4&GL;S;%/&OB!EJ=2485K*-8T:LV94RI6O0,T/]O3M9Y-3J1GED[)&3J5B +M5"J-+TM.*0'3[UEBO7SIN:G"WJRJ[\UJ];I1:YQHQ#P=B1'%GQN.$<6?$)-) +M):QV4C9J)YK0U4]*1OU4FZ4/`J78/S-API-+,Z&",QPX].!@'D"%W[-[]-V6 +M0$S(%P_2?G$11D8>PM:PZ)?`8JWN>_K6'&P%S]83!X[5V9).A^'AWJUM+)X4N.X:W&\ +M$YV;I')VCP@KSC^*1I`58M4`GHBQ:I"?$&3-:MUHELQFJ9P=934K)<.LG&J! 
+M$2K84O,Y$9D6;!/>*GI-_T$]$+=COO[-- +MB/;6QF'#S]"W(^*49/A3ET4<(-_M`?IP&)M;;=K-,NGS4HE +M>U75P*35$B:-"N)5=>-Y"Y"U0,7M.V[H#?B";!9,U=Q>!$)%3>]LGV*V()U> +M#W8KNI40$NM+Y29WUB&Y')-I=,2*S*=PH-A>J-:Z%Q[92^F'I]%4+Y\:]4I) +MLV;EFE&OGFJ:0DK)\+HSH<4ZZ5^/>S_V^C^3'`EM^V+7*?4K&7X@G2#-#]DF +MX4<]?2PM-V;"*>^X[SO@WV"Y +MLE^"^RS:62J,&3T)4Z:F)G7DT;8)M*"(O^PW_)Z7"-S@^*.Q'[%Q@,D`"V]J +M+U@[2@F(J/#8:@U3(0:&H@>F?2]25%Z`+H5Z;H!>KK:8>ADE";#X+YDMD`2- +M4@84J)XWD`*ZX&Z$=2N%(`6>]MT27N82A%J\9KN!B`.(!G%60`H@918H^E1V +M02I"-)9[(!2Y!AN`"=F*CI^W](/:DN/:D89=KA5Q#DV.X\KG*-.DG^29M%SM +MI'XPPV<^XW.Q]L%XC$>P_(>CUFB8KDQ.0)FA,FS]0EBG*Y]GU-`23O:23GP.5-LB/0ALJ)]=3$96EVK/<)@ +M5=3WX;DZLXQJR+:=:+%8I61(D +M\JF>FAAO'6;/2URY:UIBJ(^=E8R6,"FEYFDI>U+*IPVC7-42$D5![$W9JQ7T +M,J3<03PN`#7F&^)`X/#\S@Z&ZYNNX[X-P,W-A_Z:Y\'#S9./E1?I%!D8`$Z? +MX;R`G0`=.%4"9DL0Q%QF(M1W03(FFH9P2SAV(Q0^SE/XVI&V^"@'7,P>;/5F +MV;*CU>X2'@WL8Z4GJRF(C]D\V>&,BY06/7L2"VIQ?"GTH!M8OXM@!8\%L6?Y +M#KR%V_"N>$8V$&,F26$01I`Q*HI;[:&-R)[YQ[9K+Q[^Q39ZRM"'#$[$,<+0M]U@[OG+&$EA)1V&3,3JC$E@V12@C0'I +MX]$`4W$HT7@*15L[YWP4)-%V)HNX]MH/,**T'W%ZQ/1)H;N%K=G10YI$B)J= +MXB9`4F2MO%/69+/AVA7-ZJQ<;I9+S4HU6]#`13[5MMOX:FYGX4]MD!?O]AA^ +M46K2T^TQ?K998T_1#SQ^;T_#S2Q\A7-U.\'M592.+WW\JW%WU/GAUY&5P+>\ +M0?A[#%A@@SV2"W,YF&+8B9#*!*<77FX>R($R1&A8G'T'ZDW4()7BD5"@JQ2A +MH!W[]L/D!D1O%J`[%6P4B6#F^Y4OV]AAZ`,@2.,DM/U;'LKRC8*R +M.(D#5FA]P1"C@P)VZWOK532J$'GG@XO9O<>?>3I>T^"'.?&(H^THD(+<@B<4PQ43KM:,9\(_5>=6#AT'_"OY%GX$:O1JT>@#T:M`?0\O7K3>=WBOX[8\'@(D@.M"J +MT^M96``/0ZLW[(PZ;RSQ,ACA[\@:O&EUZ:D/_P[%-/ZSCVU_M'XU6+?5>S5N +MO8)&7:MU07UTK4MHV^W\B(7]=DL.Z*I%@[_J],8CHN:JWQN]QH=>ZTH=Z?=: +MHTZ_ASW"TWA`#^W7+2"T9_T"6'M]_!\?@%!J.[ZR!IVV:-V_-%B_A_]W86C] +M:\0%OP/\_\+"'^B:Z*<]2,&_YUV9TJYHV&4=%31#J]]L"Z +MLGK$D`OK%_QY#;V-AL2I$="-K.E%K`>>M/M7UYTN]M7M`$0/68!@R`@Q''CZ +MDZ_ZFS@=7N7`.^@853@;]# +M:X0_,`4]P#'HPV`&8W5N/[1^&EN]-@#"6#HX1P".O!^^1L#AJ(4RA)$[R;WA +MZ`+E!GY@`@P,5P)K8!R$;=Q3G!KW1AU`]J;5'H^O\+?;`:`WUN"'_M"BAR%P +M4&:25&"NM&/P?_>YNKPDEO^GSEE&3`;TW:E^Q@5OIT:Y6M*T.N0SKC*+)WI297/39/.M#IQSF>4=(F--T($X5VA9``2,]H*QE#1]AZ` +MS`V@QSBI47%[9T?B=#&KAT3MHR+U^7/M?,)5'!#56>S'C,!:28\)UDSPSQ)I +M0[^SUK`=)3!\8-1/_OO\&7N4]1=6"L"Y!G!\8%U=CWX].$[#[N&&9RX>GK_#-6 +MWG_ABOC+3[;P">3'09Z.];[E#_>>/PO23QCBVITQWQCLH^.^6M-D[)?.&78< +M,H`S8Y;C_(@/>8K?Y>4&Z=%0A>!+.V&0CW=,<97+_7P4*X#B0U$,PPKS*EH0 +M0P?<%68BGP@K:``AMV?Y:$,65X385>0CR(P"ON"S\.\",=$1O-TQU) +M*A6S;%3,Q&66A5#2?07-1C$=B#$1\='=%-AKF\BH!Y +MF?YZP2/?%P\X9@S3F1$)G8O)MB)U+1"W-PKJ;MP$06!`V.);%#_!->_F_P[/ +M@75T'+OK9#&<>JOC%=&^(5Z)FG2A2H"DB%(M792RFYTT2^7=)XLEXR1*9=>. 
+M_38_&Z;._62:"![ML0.1\]]726-X9Y"NUT0%8,Z!6<\2WRBY%EEGT4=> +M?;PD]2LE^%FGA>V*8GB0=RTE!7_`QUL^@8+]O\^R)XFIJK]2,BJ5:.UBKC:S +MI^&:U*"_=J.;7D='3WPK;+_DT\&XIZZ0)>F+7MX +M-8/FLHP2U>++>&SK.RI&6F:J_+"7HE7[1-H?3ZLV>UO?Z?K3J$^=^5K%J-1J +M\0$7F;`)TE\0'[V*LON+4?ZIB-KB&<66"">U[J/@N.PC\B.^5!]:PXT$YLUR +M/45QV]Y*!9[\N!@^'-V1;=Q1G[2].P"W+?#IAC.W;^-RLW32-'>DDM4K1EW_ +MU!;(0.CA=C@0]]KB.VWR6P3R:VU*2_'WL$EQQ9>F2!MI-_'\<*]/3)']TO%L +M?1OM"WR=ZRP>:_3%,D0,RV??048(\#-]-%#"0C$;'[@2!@5AM&%CL^"^P:+; +M5."_&5HKNDY4U&XL_#2V!K].R!452?PT*;%-^<^:E'W\CG^+JB&>KIFN&)MI5FY;19V7$KMUHRJMJ=7'RMZ?FEF`Q$GH)V +M?'B&]HG>`V;[',R7^Q8V06MWQGUA*GW^SO'6@7X[_T!>M1/[.[HIB&CUK$]$ +M^S?\%%#RKGCHB?3AM)8BV3.7W5*F#--ZD9:5`BZPG<-JM#4NC%K8U(C8&SO@ +MN),5=RRP$98PM;=E6)SU&=1JI6%4]8\:5T],*(CO$$07+>,,7$&!]@G6PI2R +M4HLXQJGXH*X8HKC.>0/;4?_A3-R:LIEXI<^Y$&5%B>8?B48T8[`01&?TQNZ= +>\,Y;ATP>0^,Y\->; Mon, 19 Oct 1998 07:31:09 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id HAA13574 for ; Mon, 19 Oct 1998 07:12:57 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id GAA13957; + Mon, 19 Oct 1998 06:25:09 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Mon, 19 Oct 1998 06:22:35 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id GAA13581 + for pgsql-hackers-outgoing; Mon, 19 Oct 1998 06:22:33 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by hub.org (8.8.8/8.8.8) with ESMTP id GAA13566 + for ; Mon, 19 Oct 1998 06:22:27 -0400 (EDT) + (envelope-from wieck@sapserv.debis.de) +Received: by dsh.de; id MAA13918; Mon, 19 Oct 1998 12:21:16 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma013635; Mon, 19 Oct 98 12:20:55 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id MAA11037; + Mon, 19 Oct 1998 12:18:27 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id MAA29382; + Mon, 19 Oct 1998 12:20:49 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zVA2V-000B5AC; Mon, 19 Oct 98 09:47 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for maillist@candle.pha.pa.us + id m0zVCaT-000EBPC; Mon, 19 Oct 98 12:30 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +To: Inoue@tpf.co.jp (Hiroshi Inoue) +Date: Mon, 19 Oct 1998 12:30:52 +0200 (MET DST) +Cc: jwieck@debis.com, pgsql-hackers@postgreSQL.org, maillist@candle.pha.pa.us +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <002801bdfb46$39ad8ec0$2801007e@cadzone.tpf.co.jp> from "Hiroshi Inoue" at Oct 19, 98 06:52:46 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +Hiroshi Inoue wrote: + +> When using cursors,in most cases the response to get first(next) rows +> is necessary for me,not the throughput. +> How can we tell PostgreSQL optimzer that the response is necessary ? + + With my LIMIT patch, the offset and the row count are part of + the querytree. 
+    And if a LIMIT is given, the limitCount element
+    of the querytree (a Node *) isn't NULL, which it is by default.
+
+    When a LIMIT is given, the optimizer could assume that the first
+    rows are wanted (even if the limit is ALL, maybe - but I have
+    to think about this some more). And this assumption might let
+    it decide to use an index to resolve an ORDER BY even if no
+    qualification was given.
+
+    Telling the optimizer that the first rows are wanted in a cursor
+    operation would read
+
+        DECLARE c CURSOR FOR SELECT * FROM mytab ORDER BY a LIMIT ALL;
+
+
+Jan
+
+--
+
+#======================================================================#
+# It's easier to get forgiveness for being wrong than for being right. #
+# Let's break this rule - forgive me.                                  #
+#======================================== jwieck@debis.com (Jan Wieck) #
+
+
+
+
+From owner-pgsql-hackers@hub.org Tue Oct 20 06:01:49 1998
+Received: from renoir.op.net (root@renoir.op.net [209.152.193.4])
+    by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id GAA02483
+    for ; Tue, 20 Oct 1998 06:01:48 -0400 (EDT)
+Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id FAA07799 for ; Tue, 20 Oct 1998 05:51:19 -0400 (EDT)
+Received: from localhost (majordom@localhost)
+    by hub.org (8.8.8/8.8.8) with SMTP id FAA00108;
+    Tue, 20 Oct 1998 05:17:58 -0400 (EDT)
+    (envelope-from owner-pgsql-hackers@hub.org)
+Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Tue, 20 Oct 1998 05:16:37 +0000 (EDT)
+Received: (from majordom@localhost)
+    by hub.org (8.8.8/8.8.8) id FAA29953
+    for pgsql-hackers-outgoing; Tue, 20 Oct 1998 05:16:35 -0400 (EDT)
+    (envelope-from owner-pgsql-hackers@postgreSQL.org)
+X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f
+Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2])
+    by hub.org (8.8.8/8.8.8) with ESMTP id FAA29939
+    for ; Tue, 20 Oct 1998 05:16:27 -0400 (EDT)
+    (envelope-from wieck@sapserv.debis.de)
+Received: by dsh.de; id LAA04585; Tue, 20 Oct 1998 11:15:05 +0200 (MET DST)
+Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2)
+    id xma004337; Tue, 20 Oct 98 11:14:46 +0200
+Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5])
+    by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id LAA14628;
+    Tue, 20 Oct 1998 11:12:27 +0200 (MET DST)
+Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17])
+    by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id LAA03564;
+    Tue, 20 Oct 1998 11:14:52 +0200
+Received: from orion.SAPserv.Hamburg.dsh.de
+    by mars.SAPserv.Hamburg.dsh.de with smtp
+    for <>
+    id m0zVVUa-000B5AC; Tue, 20 Oct 98 08:42 MET DST
+Received: by orion.SAPserv.Hamburg.dsh.de
+    for maillist@candle.pha.pa.us
+    id m0zVY2c-000EBPC; Tue, 20 Oct 98 11:25 MET DST
+Message-Id: 
+From: jwieck@debis.com (Jan Wieck)
+Subject: Re: [HACKERS] What about LIMIT in SELECT ?
+To: Inoue@tpf.co.jp (Hiroshi Inoue) +Date: Tue, 20 Oct 1998 11:25:22 +0200 (MET DST) +Cc: maillist@candle.pha.pa.us, jwieck@debis.com, hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <000601bdfc03$02e67100$2801007e@cadzone.tpf.co.jp> from "Hiroshi Inoue" at Oct 20, 98 05:24:09 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +Hiroshi Inoue wrote: + +> > * Prevent psort() usage when query already using index matching ORDER BY +> > +> > +> +> I can't find the reference to descending order cases except my posting. +> If we use an index scan to remove sorts in those cases,backward positioning +> and scanning are necessary. + + I think it's only thought as a reminder that the optimizer + needs some optimization. + + That topic, and the LIMIT stuff too I think, is past 6.4 work + and may go into a 6.4.1 performance release. So when we are + after 6.4, we have enough time to work out a real solution, + instead of just throwing in a patch as a quick shot. + + What we two did where steps in the same direction. Your one + covers more situations, but after all if multiple people have + the same idea there is a good chance that it is the right + thing to do. + +> +> Let t be a table with 2 indices, index1(key1,key2), index2(key1,key3). +> i.e. key1 is common to index1 and index2. +> +> And for the query +> select * from t where key1>....; +> +> If PosgreSQL optimizer choose [ index scan on index1 ] we can't remove +> sorts from the following query. +> select * from t where key1>... order by key1,key3; +> +> Similarly if [ index scan on index2 ] are chosen we can't remove sorts +> from the following query. +> select * from t where key1>... order by key1,key2; +> +> But in both cases (clever) optimizer can choose another index for scan. + + Right. As I remember, your solution does basically the same + as my one. It does not change the optimizers decision about + the index or if an index at all is used. So I assume they + hook into the same position where depending on the order by + clause the sort node is added. And that is at the very end of + the optimizer. + + What you describe above requires changes in upper levels of + optimization. Doing that is far away from my knowledge about + the optimizer. And some of your earlier statements let me + think you aren't familiar enough with it too. We need at + least help from others to do it well. + + I don't want to dive that deep into the optimizer. There was + a far too long time where the rule system was broken and got + out of sync with the parser/optimizer capabilities. I fixed + many things in it for 6.4. My first priority now is, not to + let such a situation come up again. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. 
# +#======================================== jwieck@debis.com (Jan Wieck) # + + + + +From wieck@sapserv.debis.de Tue Oct 20 13:00:04 1998 +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id NAA08269 + for ; Tue, 20 Oct 1998 13:00:01 -0400 (EDT) +Received: by dsh.de; id TAA14203; Tue, 20 Oct 1998 19:02:15 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma014037; Tue, 20 Oct 98 19:01:39 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id SAA24445; + Tue, 20 Oct 1998 18:59:16 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id TAA06159; + Tue, 20 Oct 1998 19:01:40 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zVcmS-000B5AC; Tue, 20 Oct 98 16:29 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for maillist@candle.pha.pa.us + id m0zVfKV-000EBPC; Tue, 20 Oct 98 19:12 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +To: maillist@candle.pha.pa.us (Bruce Momjian) +Date: Tue, 20 Oct 1998 19:12:19 +0200 (MET DST) +Cc: jwieck@debis.com, Inoue@tpf.co.jp, hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <199810201645.MAA07946@candle.pha.pa.us> from "Bruce Momjian" at Oct 20, 98 12:45:49 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Status: ROr + +> +> I agree. Another good thing is that the LIMIT thing will not require a +> dump/reload, so it is a good candidate for a minor release. + + That's wrong, sorry. + + The limit thing as I implemented it adds 2 new variables to + the Query structure. Rewrite rules are stored as querytrees + and in the existing pg_rewrite entries that would be missing. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. 
# +#======================================== jwieck@debis.com (Jan Wieck) # + + + +From owner-pgsql-hackers@hub.org Tue Oct 20 13:24:47 1998 +Received: from hub.org (majordom@hub.org [209.47.148.200]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id NAA08484 + for ; Tue, 20 Oct 1998 13:24:45 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id NAA01878; + Tue, 20 Oct 1998 13:00:06 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Tue, 20 Oct 1998 12:59:59 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id MAA01579 + for pgsql-hackers-outgoing; Tue, 20 Oct 1998 12:59:58 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by hub.org (8.8.8/8.8.8) with ESMTP id MAA01557 + for ; Tue, 20 Oct 1998 12:59:52 -0400 (EDT) + (envelope-from wieck@sapserv.debis.de) +Received: by dsh.de; id TAA14203; Tue, 20 Oct 1998 19:02:15 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma014037; Tue, 20 Oct 98 19:01:39 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id SAA24445; + Tue, 20 Oct 1998 18:59:16 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id TAA06159; + Tue, 20 Oct 1998 19:01:40 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zVcmS-000B5AC; Tue, 20 Oct 98 16:29 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for maillist@candle.pha.pa.us + id m0zVfKV-000EBPC; Tue, 20 Oct 98 19:12 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +To: maillist@candle.pha.pa.us (Bruce Momjian) +Date: Tue, 20 Oct 1998 19:12:19 +0200 (MET DST) +Cc: jwieck@debis.com, Inoue@tpf.co.jp, hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <199810201645.MAA07946@candle.pha.pa.us> from "Bruce Momjian" at Oct 20, 98 12:45:49 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +> +> I agree. Another good thing is that the LIMIT thing will not require a +> dump/reload, so it is a good candidate for a minor release. + + That's wrong, sorry. + + The limit thing as I implemented it adds 2 new variables to + the Query structure. Rewrite rules are stored as querytrees + and in the existing pg_rewrite entries that would be missing. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. 
# +#======================================== jwieck@debis.com (Jan Wieck) # + + + + +From wieck@sapserv.debis.de Tue Oct 20 13:10:22 1998 +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id NAA08339 + for ; Tue, 20 Oct 1998 13:10:18 -0400 (EDT) +Received: by dsh.de; id TAA17171; Tue, 20 Oct 1998 19:12:30 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma017064; Tue, 20 Oct 98 19:12:00 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id TAA24806; + Tue, 20 Oct 1998 19:09:37 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id TAA06212; + Tue, 20 Oct 1998 19:12:01 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zVcwS-000B5AC; Tue, 20 Oct 98 16:39 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for maillist@candle.pha.pa.us + id m0zVfUW-000EBPC; Tue, 20 Oct 98 19:22 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +To: maillist@candle.pha.pa.us (Bruce Momjian) +Date: Tue, 20 Oct 1998 19:22:40 +0200 (MET DST) +Cc: jwieck@debis.com, Inoue@tpf.co.jp, hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <199810201702.NAA08286@candle.pha.pa.us> from "Bruce Momjian" at Oct 20, 98 01:02:58 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Status: RO + +> +> > > +> > > I agree. Another good thing is that the LIMIT thing will not require a +> > > dump/reload, so it is a good candidate for a minor release. +> > +> > That's wrong, sorry. +> > +> > The limit thing as I implemented it adds 2 new variables to +> > the Query structure. Rewrite rules are stored as querytrees +> > and in the existing pg_rewrite entries that would be missing. +> +> Oh, sorry. I forgot. That could be tough. + + But it wouldn't hurt to add them now to have them in + place. The required out-, read- and copyfuncs are in + my patch too. This would prevent dump/load when we + later add the real LIMIT functionality. And it does + not change anything now. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. 
# +#======================================== jwieck@debis.com (Jan Wieck) # + + +From owner-pgsql-hackers@hub.org Tue Oct 20 14:57:36 1998 +Received: from hub.org (majordom@hub.org [209.47.148.200]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id OAA11449 + for ; Tue, 20 Oct 1998 14:57:34 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id NAA03547; + Tue, 20 Oct 1998 13:10:38 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Tue, 20 Oct 1998 13:10:23 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id NAA03488 + for pgsql-hackers-outgoing; Tue, 20 Oct 1998 13:10:21 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by hub.org (8.8.8/8.8.8) with ESMTP id NAA03455 + for ; Tue, 20 Oct 1998 13:10:10 -0400 (EDT) + (envelope-from wieck@sapserv.debis.de) +Received: by dsh.de; id TAA17171; Tue, 20 Oct 1998 19:12:30 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma017064; Tue, 20 Oct 98 19:12:00 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id TAA24806; + Tue, 20 Oct 1998 19:09:37 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id TAA06212; + Tue, 20 Oct 1998 19:12:01 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zVcwS-000B5AC; Tue, 20 Oct 98 16:39 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for maillist@candle.pha.pa.us + id m0zVfUW-000EBPC; Tue, 20 Oct 98 19:22 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +To: maillist@candle.pha.pa.us (Bruce Momjian) +Date: Tue, 20 Oct 1998 19:22:40 +0200 (MET DST) +Cc: jwieck@debis.com, Inoue@tpf.co.jp, hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <199810201702.NAA08286@candle.pha.pa.us> from "Bruce Momjian" at Oct 20, 98 01:02:58 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +> +> > > +> > > I agree. Another good thing is that the LIMIT thing will not require a +> > > dump/reload, so it is a good candidate for a minor release. +> > +> > That's wrong, sorry. +> > +> > The limit thing as I implemented it adds 2 new variables to +> > the Query structure. Rewrite rules are stored as querytrees +> > and in the existing pg_rewrite entries that would be missing. +> +> Oh, sorry. I forgot. That could be tough. + + But it wouldn't hurt to add them now to have them in + place. The required out-, read- and copyfuncs are in + my patch too. This would prevent dump/load when we + later add the real LIMIT functionality. And it does + not change anything now. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. 
# +#======================================== jwieck@debis.com (Jan Wieck) # + + + +From owner-pgsql-hackers@hub.org Wed Oct 21 02:35:54 1998 +Received: from hub.org (majordom@hub.org [209.47.148.200]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id CAA29494 + for ; Wed, 21 Oct 1998 02:35:53 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id CAA13326; + Wed, 21 Oct 1998 02:10:42 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 21 Oct 1998 02:09:35 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id CAA12900 + for pgsql-hackers-outgoing; Wed, 21 Oct 1998 02:09:33 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from candle.pha.pa.us (maillist@s5-03.ppp.op.net [209.152.195.67]) + by hub.org (8.8.8/8.8.8) with ESMTP id CAA12871 + for ; Wed, 21 Oct 1998 02:09:26 -0400 (EDT) + (envelope-from maillist@candle.pha.pa.us) +Received: (from maillist@localhost) + by candle.pha.pa.us (8.9.0/8.9.0) id CAA27774; + Wed, 21 Oct 1998 02:09:27 -0400 (EDT) +From: Bruce Momjian +Message-Id: <199810210609.CAA27774@candle.pha.pa.us> +Subject: Re: [HACKERS] What about LIMIT in SELECT ? +In-Reply-To: from Jan Wieck at "Oct 20, 1998 7:22:40 pm" +To: jwieck@debis.com +Date: Wed, 21 Oct 1998 02:09:26 -0400 (EDT) +Cc: jwieck@debis.com, Inoue@tpf.co.jp, hackers@postgreSQL.org +X-Mailer: ELM [version 2.4ME+ PL47 (25)] +MIME-Version: 1.0 +Content-Type: text/plain; charset=US-ASCII +Content-Transfer-Encoding: 7bit +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +> > +> > > > +> > > > I agree. Another good thing is that the LIMIT thing will not require a +> > > > dump/reload, so it is a good candidate for a minor release. +> > > +> > > That's wrong, sorry. +> > > +> > > The limit thing as I implemented it adds 2 new variables to +> > > the Query structure. Rewrite rules are stored as querytrees +> > > and in the existing pg_rewrite entries that would be missing. +> > +> > Oh, sorry. I forgot. That could be tough. +> +> But it wouldn't hurt to add them now to have them in +> place. The required out-, read- and copyfuncs are in +> my patch too. This would prevent dump/load when we +> later add the real LIMIT functionality. And it does +> not change anything now. +> + +Jan, we found that I am having to require an initdb for the INET/CIDR +type, so if you want stuff to change the views/rules for the limit +addition post 6.4, please send them in and I will apply them. + +You clearly have the syntax down, so I think you should go ahead. + + +-- + Bruce Momjian | http://www.op.net/~candle + maillist@candle.pha.pa.us | (610) 853-3000 + + If your life is a hard drive, | 830 Blythe Avenue + + Christ can be your backup. 
| Drexel Hill, Pennsylvania 19026 + + +From wieck@sapserv.debis.de Thu Oct 22 10:20:58 1998 +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id KAA20566 + for ; Thu, 22 Oct 1998 10:20:54 -0400 (EDT) +Received: by dsh.de; id QAA09067; Thu, 22 Oct 1998 16:23:14 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma008719; Thu, 22 Oct 98 16:22:40 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id QAA01558; + Thu, 22 Oct 1998 16:19:55 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id QAA18978; + Thu, 22 Oct 1998 16:22:20 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zWJG2-000B5AC; Thu, 22 Oct 98 13:50 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for maillist@candle.pha.pa.us + id m0zWLoE-000EBPC; Thu, 22 Oct 98 16:33 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] psql's help (the LIMIT stuff) +To: maillist@candle.pha.pa.us (Bruce Momjian) +Date: Thu, 22 Oct 1998 16:33:50 +0200 (MET DST) +Cc: jwieck@debis.com, jose@sferacarta.com, pgsql-hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <199810221351.JAA19663@candle.pha.pa.us> from "Bruce Momjian" at Oct 22, 98 09:51:19 am +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Status: ROr + +> > +> > I hope the QUERY_LIMIT too. +> +> I still have that cnfify() possible fix to review for KQSO. Are you +> still thinking limit for 6.4 final, or a minor release after that? + + I posted the part that is the minimum applied to 6.4 to make + adding LIMIT later non-initdb earlier. Anyway, here it's + again. + + My LIMIT implementation that does it like the SET in the + toplevel executor (but via parsetree values) is ready for + production. I only held it back because it's feature, not + bugfix. + + Do you want it in 6.4 final? + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. # +#======================================== jwieck@debis.com (Jan Wieck) # + + +diff -cr src.orig/backend/nodes/copyfuncs.c src/backend/nodes/copyfuncs.c +*** src.orig/backend/nodes/copyfuncs.c Fri Oct 16 11:53:40 1998 +--- src/backend/nodes/copyfuncs.c Fri Oct 16 13:32:35 1998 +*************** +*** 1578,1583 **** +--- 1578,1586 ---- + newnode->unionClause = temp_list; + } + ++ Node_Copy(from, newnode, limitOffset); ++ Node_Copy(from, newnode, limitCount); ++ + return newnode; + } + +diff -cr src.orig/backend/nodes/outfuncs.c src/backend/nodes/outfuncs.c +*** src.orig/backend/nodes/outfuncs.c Fri Oct 16 11:53:40 1998 +--- src/backend/nodes/outfuncs.c Fri Oct 16 13:30:50 1998 +*************** +*** 259,264 **** +--- 259,268 ---- + appendStringInfo(str, (node->hasSubLinks ? 
"true" : "false")); + appendStringInfo(str, " :unionClause "); + _outNode(str, node->unionClause); ++ appendStringInfo(str, " :limitOffset "); ++ _outNode(str, node->limitOffset); ++ appendStringInfo(str, " :limitCount "); ++ _outNode(str, node->limitCount); + } + + static void +diff -cr src.orig/backend/nodes/readfuncs.c src/backend/nodes/readfuncs.c +*** src.orig/backend/nodes/readfuncs.c Fri Oct 16 11:53:40 1998 +--- src/backend/nodes/readfuncs.c Fri Oct 16 13:31:43 1998 +*************** +*** 163,168 **** +--- 163,174 ---- + token = lsptok(NULL, &length); /* skip :unionClause */ + local_node->unionClause = nodeRead(true); + ++ token = lsptok(NULL, &length); /* skip :limitOffset */ ++ local_node->limitOffset = nodeRead(true); ++ ++ token = lsptok(NULL, &length); /* skip :limitCount */ ++ local_node->limitCount = nodeRead(true); ++ + return local_node; + } + +diff -cr src.orig/include/nodes/parsenodes.h src/include/nodes/parsenodes.h +*** src.orig/include/nodes/parsenodes.h Fri Oct 16 11:53:58 1998 +--- src/include/nodes/parsenodes.h Fri Oct 16 13:35:32 1998 +*************** +*** 60,65 **** +--- 60,67 ---- + + List *unionClause; /* unions are linked under the previous + * query */ ++ Node *limitOffset; /* # of result tuples to skip */ ++ Node *limitCount; /* # of result tuples to return */ + + /* internal to planner */ + List *base_rel_list; /* base relation list */ +*************** +*** 639,644 **** +--- 641,648 ---- + char *portalname; /* the portal (cursor) to create */ + bool binary; /* a binary (internal) portal? */ + bool unionall; /* union without unique sort */ ++ Node *limitOffset; /* # of result tuples to skip */ ++ Node *limitCount; /* # of result tuples to return */ + } SelectStmt; + + + +From owner-pgsql-hackers@hub.org Thu Oct 22 11:33:41 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id LAA01724 + for ; Thu, 22 Oct 1998 11:33:31 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id LAA12702 for ; Thu, 22 Oct 1998 11:25:02 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.9.1/8.8.8) with SMTP id KAA11023; + Thu, 22 Oct 1998 10:22:13 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Thu, 22 Oct 1998 10:21:07 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.9.1/8.8.8) id KAA10873 + for pgsql-hackers-outgoing; Thu, 22 Oct 1998 10:21:05 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by hub.org (8.9.1/8.8.8) with ESMTP id KAA10847 + for ; Thu, 22 Oct 1998 10:21:00 -0400 (EDT) + (envelope-from wieck@sapserv.debis.de) +Received: by dsh.de; id QAA09067; Thu, 22 Oct 1998 16:23:14 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma008719; Thu, 22 Oct 98 16:22:40 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id QAA01558; + Thu, 22 Oct 1998 16:19:55 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id QAA18978; + Thu, 22 Oct 1998 16:22:20 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with 
smtp + for <> + id m0zWJG2-000B5AC; Thu, 22 Oct 98 13:50 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for maillist@candle.pha.pa.us + id m0zWLoE-000EBPC; Thu, 22 Oct 98 16:33 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] psql's help (the LIMIT stuff) +To: maillist@candle.pha.pa.us (Bruce Momjian) +Date: Thu, 22 Oct 1998 16:33:50 +0200 (MET DST) +Cc: jwieck@debis.com, jose@sferacarta.com, pgsql-hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <199810221351.JAA19663@candle.pha.pa.us> from "Bruce Momjian" at Oct 22, 98 09:51:19 am +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +> > +> > I hope the QUERY_LIMIT too. +> +> I still have that cnfify() possible fix to review for KQSO. Are you +> still thinking limit for 6.4 final, or a minor release after that? + + I posted the part that is the minimum applied to 6.4 to make + adding LIMIT later non-initdb earlier. Anyway, here it's + again. + + My LIMIT implementation that does it like the SET in the + toplevel executor (but via parsetree values) is ready for + production. I only held it back because it's feature, not + bugfix. + + Do you want it in 6.4 final? + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. # +#======================================== jwieck@debis.com (Jan Wieck) # + + +diff -cr src.orig/backend/nodes/copyfuncs.c src/backend/nodes/copyfuncs.c +*** src.orig/backend/nodes/copyfuncs.c Fri Oct 16 11:53:40 1998 +--- src/backend/nodes/copyfuncs.c Fri Oct 16 13:32:35 1998 +*************** +*** 1578,1583 **** +--- 1578,1586 ---- + newnode->unionClause = temp_list; + } + ++ Node_Copy(from, newnode, limitOffset); ++ Node_Copy(from, newnode, limitCount); ++ + return newnode; + } + +diff -cr src.orig/backend/nodes/outfuncs.c src/backend/nodes/outfuncs.c +*** src.orig/backend/nodes/outfuncs.c Fri Oct 16 11:53:40 1998 +--- src/backend/nodes/outfuncs.c Fri Oct 16 13:30:50 1998 +*************** +*** 259,264 **** +--- 259,268 ---- + appendStringInfo(str, (node->hasSubLinks ? 
"true" : "false")); + appendStringInfo(str, " :unionClause "); + _outNode(str, node->unionClause); ++ appendStringInfo(str, " :limitOffset "); ++ _outNode(str, node->limitOffset); ++ appendStringInfo(str, " :limitCount "); ++ _outNode(str, node->limitCount); + } + + static void +diff -cr src.orig/backend/nodes/readfuncs.c src/backend/nodes/readfuncs.c +*** src.orig/backend/nodes/readfuncs.c Fri Oct 16 11:53:40 1998 +--- src/backend/nodes/readfuncs.c Fri Oct 16 13:31:43 1998 +*************** +*** 163,168 **** +--- 163,174 ---- + token = lsptok(NULL, &length); /* skip :unionClause */ + local_node->unionClause = nodeRead(true); + ++ token = lsptok(NULL, &length); /* skip :limitOffset */ ++ local_node->limitOffset = nodeRead(true); ++ ++ token = lsptok(NULL, &length); /* skip :limitCount */ ++ local_node->limitCount = nodeRead(true); ++ + return local_node; + } + +diff -cr src.orig/include/nodes/parsenodes.h src/include/nodes/parsenodes.h +*** src.orig/include/nodes/parsenodes.h Fri Oct 16 11:53:58 1998 +--- src/include/nodes/parsenodes.h Fri Oct 16 13:35:32 1998 +*************** +*** 60,65 **** +--- 60,67 ---- + + List *unionClause; /* unions are linked under the previous + * query */ ++ Node *limitOffset; /* # of result tuples to skip */ ++ Node *limitCount; /* # of result tuples to return */ + + /* internal to planner */ + List *base_rel_list; /* base relation list */ +*************** +*** 639,644 **** +--- 641,648 ---- + char *portalname; /* the portal (cursor) to create */ + bool binary; /* a binary (internal) portal? */ + bool unionall; /* union without unique sort */ ++ Node *limitOffset; /* # of result tuples to skip */ ++ Node *limitCount; /* # of result tuples to return */ + } SelectStmt; + + + + +From wieck@sapserv.debis.de Thu Oct 22 11:01:05 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id LAA21185 + for ; Thu, 22 Oct 1998 11:01:00 -0400 (EDT) +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id KAA09646 for ; Thu, 22 Oct 1998 10:44:36 -0400 (EDT) +Received: by dsh.de; id QAA19394; Thu, 22 Oct 1998 16:43:42 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma017268; Thu, 22 Oct 98 16:39:44 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id QAA02988; + Thu, 22 Oct 1998 16:36:46 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id QAA19155; + Thu, 22 Oct 1998 16:39:10 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zWJWL-000B5DC; Thu, 22 Oct 98 14:07 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for maillist@candle.pha.pa.us + id m0zWM4W-000EBPC; Thu, 22 Oct 98 16:50 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] psql's help (the LIMIT stuff) +To: maillist@candle.pha.pa.us (Bruce Momjian) +Date: Thu, 22 Oct 1998 16:50:40 +0200 (MET DST) +Cc: jwieck@debis.com, jose@sferacarta.com, pgsql-hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <199810221424.KAA20601@candle.pha.pa.us> from "Bruce Momjian" at Oct 22, 98 10:24:08 am +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Status: RO + +> +> > > > +> > > > I hope the QUERY_LIMIT too. 
+> > > +> > > I still have that cnfify() possible fix to review for KQSO. Are you +> > > still thinking limit for 6.4 final, or a minor release after that? +> > +> > I posted the part that is the minimum applied to 6.4 to make +> > adding LIMIT later non-initdb earlier. Anyway, here it's +> > again. +> +> Already applied. I assume it is the same as the one I applied. + + Seen, thanks. Your 'Applied' just arrived after I packed it + again. It's the same. + +> We are close to final, and can easily put it in 6.4.1, which I am sure +> we will need, and if we split CVS trees, you'll have lots of minor +> versions to pick from. :-) +> +> Seems like it would be a nice minor release item, but the problem is +> that minor releases aren't tested as much as major ones. How confident +> are you in the code? What do others thing? + + I regression tested it, and did additional tests in the + SPI/PL area. It works. It only touches the parser and the + executor. Rules, planner/optimizer just bypass the values in + the parsetree. The parser and the executor are parts of + Postgres I feel very familiar with (not so in the optimizer). + I trust in the code and would use it in a production + environment. + + It's below. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. # +#======================================== jwieck@debis.com (Jan Wieck) # + + +diff -cr src.orig/backend/commands/command.c src/backend/commands/command.c +*** src.orig/backend/commands/command.c Fri Oct 16 11:53:38 1998 +--- src/backend/commands/command.c Fri Oct 16 12:56:44 1998 +*************** +*** 39,44 **** +--- 39,45 ---- + #include "utils/mcxt.h" + #include "utils/portal.h" + #include "utils/syscache.h" ++ #include "string.h" + + /* ---------------- + * PortalExecutorHeapMemory stuff +*************** +*** 101,106 **** +--- 102,108 ---- + int feature; + QueryDesc *queryDesc; + MemoryContext context; ++ Const limcount; + + /* ---------------- + * sanity checks +*************** +*** 113,118 **** +--- 115,134 ---- + } + + /* ---------------- ++ * Create a const node from the given count value ++ * ---------------- ++ */ ++ memset(&limcount, 0, sizeof(limcount)); ++ limcount.type = T_Const; ++ limcount.consttype = INT4OID; ++ limcount.constlen = sizeof(int4); ++ limcount.constvalue = (Datum)count; ++ limcount.constisnull = FALSE; ++ limcount.constbyval = TRUE; ++ limcount.constisset = FALSE; ++ limcount.constiscast = FALSE; ++ ++ /* ---------------- + * get the portal from the portal name + * ---------------- + */ +*************** +*** 176,182 **** + PortalExecutorHeapMemory = (MemoryContext) + PortalGetHeapMemory(portal); + +! ExecutorRun(queryDesc, PortalGetState(portal), feature, count); + + if (dest == None) /* MOVE */ + pfree(queryDesc); +--- 192,198 ---- + PortalExecutorHeapMemory = (MemoryContext) + PortalGetHeapMemory(portal); + +! 
ExecutorRun(queryDesc, PortalGetState(portal), feature, (Node *)NULL, (Node *)&limcount); + + if (dest == None) /* MOVE */ + pfree(queryDesc); +diff -cr src.orig/backend/executor/execMain.c src/backend/executor/execMain.c +*** src.orig/backend/executor/execMain.c Fri Oct 16 11:53:38 1998 +--- src/backend/executor/execMain.c Fri Oct 16 20:05:19 1998 +*************** +*** 64,69 **** +--- 64,70 ---- + static void EndPlan(Plan *plan, EState *estate); + static TupleTableSlot *ExecutePlan(EState *estate, Plan *plan, + Query *parseTree, CmdType operation, ++ int offsetTuples, + int numberTuples, ScanDirection direction, + void (*printfunc) ()); + static void ExecRetrieve(TupleTableSlot *slot, void (*printfunc) (), +*************** +*** 163,169 **** + * ---------------------------------------------------------------- + */ + TupleTableSlot * +! ExecutorRun(QueryDesc *queryDesc, EState *estate, int feature, int count) + { + CmdType operation; + Query *parseTree; +--- 164,170 ---- + * ---------------------------------------------------------------- + */ + TupleTableSlot * +! ExecutorRun(QueryDesc *queryDesc, EState *estate, int feature, Node *limoffset, Node *limcount) + { + CmdType operation; + Query *parseTree; +*************** +*** 171,176 **** +--- 172,179 ---- + TupleTableSlot *result; + CommandDest dest; + void (*destination) (); ++ int offset = 0; ++ int count = 0; + + /****************** + * sanity checks +*************** +*** 191,196 **** +--- 194,289 ---- + estate->es_processed = 0; + estate->es_lastoid = InvalidOid; + ++ /****************** ++ * if given get the offset of the LIMIT clause ++ ****************** ++ */ ++ if (limoffset != NULL) ++ { ++ Const *coffset; ++ Param *poffset; ++ ParamListInfo paramLI; ++ int i; ++ ++ switch (nodeTag(limoffset)) ++ { ++ case T_Const: ++ coffset = (Const *)limoffset; ++ offset = (int)(coffset->constvalue); ++ break; ++ ++ case T_Param: ++ poffset = (Param *)limoffset; ++ paramLI = estate->es_param_list_info; ++ ++ if (paramLI == NULL) ++ elog(ERROR, "parameter for limit offset not in executor state"); ++ for (i = 0; paramLI[i].kind != PARAM_INVALID; i++) ++ { ++ if (paramLI[i].kind == PARAM_NUM && paramLI[i].id == poffset->paramid) ++ break; ++ } ++ if (paramLI[i].kind == PARAM_INVALID) ++ elog(ERROR, "parameter for limit offset not in executor state"); ++ if (paramLI[i].isnull) ++ elog(ERROR, "limit offset cannot be NULL value"); ++ offset = (int)(paramLI[i].value); ++ ++ break; ++ ++ default: ++ elog(ERROR, "unexpected node type %d as limit offset", nodeTag(limoffset)); ++ } ++ ++ if (offset < 0) ++ elog(ERROR, "limit offset cannot be negative"); ++ } ++ ++ /****************** ++ * if given get the count of the LIMIT clause ++ ****************** ++ */ ++ if (limcount != NULL) ++ { ++ Const *ccount; ++ Param *pcount; ++ ParamListInfo paramLI; ++ int i; ++ ++ switch (nodeTag(limcount)) ++ { ++ case T_Const: ++ ccount = (Const *)limcount; ++ count = (int)(ccount->constvalue); ++ break; ++ ++ case T_Param: ++ pcount = (Param *)limcount; ++ paramLI = estate->es_param_list_info; ++ ++ if (paramLI == NULL) ++ elog(ERROR, "parameter for limit count not in executor state"); ++ for (i = 0; paramLI[i].kind != PARAM_INVALID; i++) ++ { ++ if (paramLI[i].kind == PARAM_NUM && paramLI[i].id == pcount->paramid) ++ break; ++ } ++ if (paramLI[i].kind == PARAM_INVALID) ++ elog(ERROR, "parameter for limit count not in executor state"); ++ if (paramLI[i].isnull) ++ elog(ERROR, "limit count cannot be NULL value"); ++ count = (int)(paramLI[i].value); ++ ++ break; ++ ++ 
default: ++ elog(ERROR, "unexpected node type %d as limit count", nodeTag(limcount)); ++ } ++ ++ if (count < 0) ++ elog(ERROR, "limit count cannot be negative"); ++ } ++ + switch (feature) + { + +*************** +*** 199,205 **** + plan, + parseTree, + operation, +! ALL_TUPLES, + ForwardScanDirection, + destination); + break; +--- 292,299 ---- + plan, + parseTree, + operation, +! offset, +! count, + ForwardScanDirection, + destination); + break; +*************** +*** 208,213 **** +--- 302,308 ---- + plan, + parseTree, + operation, ++ offset, + count, + ForwardScanDirection, + destination); +*************** +*** 222,227 **** +--- 317,323 ---- + plan, + parseTree, + operation, ++ offset, + count, + BackwardScanDirection, + destination); +*************** +*** 237,242 **** +--- 333,339 ---- + plan, + parseTree, + operation, ++ 0, + ONE_TUPLE, + ForwardScanDirection, + destination); +*************** +*** 691,696 **** +--- 788,794 ---- + Plan *plan, + Query *parseTree, + CmdType operation, ++ int offsetTuples, + int numberTuples, + ScanDirection direction, + void (*printfunc) ()) +*************** +*** 742,747 **** +--- 840,859 ---- + { + result = NULL; + break; ++ } ++ ++ /****************** ++ * For now we completely execute the plan and skip ++ * result tuples if requested by LIMIT offset. ++ * Finally we should try to do it in deeper levels ++ * if possible (during index scan) ++ * - Jan ++ ****************** ++ */ ++ if (offsetTuples > 0) ++ { ++ --offsetTuples; ++ continue; + } + + /****************** +diff -cr src.orig/backend/executor/functions.c src/backend/executor/functions.c +*** src.orig/backend/executor/functions.c Fri Oct 16 11:53:38 1998 +--- src/backend/executor/functions.c Fri Oct 16 19:01:02 1998 +*************** +*** 130,135 **** +--- 130,138 ---- + None); + estate = CreateExecutorState(); + ++ if (queryTree->limitOffset != NULL || queryTree->limitCount != NULL) ++ elog(ERROR, "LIMIT clause from SQL functions not yet implemented"); ++ + if (nargs > 0) + { + int i; +*************** +*** 200,206 **** + + feature = (LAST_POSTQUEL_COMMAND(es)) ? EXEC_RETONE : EXEC_RUN; + +! return ExecutorRun(es->qd, es->estate, feature, 0); + } + + static void +--- 203,209 ---- + + feature = (LAST_POSTQUEL_COMMAND(es)) ? EXEC_RETONE : EXEC_RUN; + +! return ExecutorRun(es->qd, es->estate, feature, (Node *)NULL, (Node *)NULL); + } + + static void +diff -cr src.orig/backend/executor/spi.c src/backend/executor/spi.c +*** src.orig/backend/executor/spi.c Fri Oct 16 11:53:39 1998 +--- src/backend/executor/spi.c Fri Oct 16 19:25:33 1998 +*************** +*** 791,796 **** +--- 791,798 ---- + bool isRetrieveIntoRelation = false; + char *intoName = NULL; + int res; ++ Const tcount_const; ++ Node *count = NULL; + + switch (operation) + { +*************** +*** 825,830 **** +--- 827,865 ---- + return SPI_ERROR_OPUNKNOWN; + } + ++ /* ---------------- ++ * Get the query LIMIT tuple count ++ * ---------------- ++ */ ++ if (parseTree->limitCount != NULL) ++ { ++ /* ---------------- ++ * A limit clause in the parsetree overrides the ++ * tcount parameter ++ * ---------------- ++ */ ++ count = parseTree->limitCount; ++ } ++ else ++ { ++ /* ---------------- ++ * No LIMIT clause in parsetree. 
Use a local Const node ++ * to put tcount into it ++ * ---------------- ++ */ ++ memset(&tcount_const, 0, sizeof(tcount_const)); ++ tcount_const.type = T_Const; ++ tcount_const.consttype = INT4OID; ++ tcount_const.constlen = sizeof(int4); ++ tcount_const.constvalue = (Datum)tcount; ++ tcount_const.constisnull = FALSE; ++ tcount_const.constbyval = TRUE; ++ tcount_const.constisset = FALSE; ++ tcount_const.constiscast = FALSE; ++ ++ count = (Node *)&tcount_const; ++ } ++ + if (state == NULL) /* plan preparation */ + return res; + #ifdef SPI_EXECUTOR_STATS +*************** +*** 845,851 **** + return SPI_OK_CURSOR; + } + +! ExecutorRun(queryDesc, state, EXEC_FOR, tcount); + + _SPI_current->processed = state->es_processed; + if (operation == CMD_SELECT && queryDesc->dest == SPI) +--- 880,886 ---- + return SPI_OK_CURSOR; + } + +! ExecutorRun(queryDesc, state, EXEC_FOR, parseTree->limitOffset, count); + + _SPI_current->processed = state->es_processed; + if (operation == CMD_SELECT && queryDesc->dest == SPI) +diff -cr src.orig/backend/parser/analyze.c src/backend/parser/analyze.c +*** src.orig/backend/parser/analyze.c Fri Oct 16 11:53:41 1998 +--- src/backend/parser/analyze.c Fri Oct 16 13:29:27 1998 +*************** +*** 180,186 **** +--- 180,190 ---- + + case T_SelectStmt: + if (!((SelectStmt *) parseTree)->portalname) ++ { + result = transformSelectStmt(pstate, (SelectStmt *) parseTree); ++ result->limitOffset = ((SelectStmt *)parseTree)->limitOffset; ++ result->limitCount = ((SelectStmt *)parseTree)->limitCount; ++ } + else + result = transformCursorStmt(pstate, (SelectStmt *) parseTree); + break; +diff -cr src.orig/backend/parser/gram.y src/backend/parser/gram.y +*** src.orig/backend/parser/gram.y Fri Oct 16 11:53:42 1998 +--- src/backend/parser/gram.y Sun Oct 18 22:20:36 1998 +*************** +*** 45,50 **** +--- 45,51 ---- + #include "catalog/catname.h" + #include "utils/elog.h" + #include "access/xact.h" ++ #include "catalog/pg_type.h" + + #ifdef MULTIBYTE + #include "mb/pg_wchar.h" +*************** +*** 163,169 **** + sort_clause, sortby_list, index_params, index_list, name_list, + from_clause, from_list, opt_array_bounds, nest_array_bounds, + expr_list, attrs, res_target_list, res_target_list2, +! def_list, opt_indirection, group_clause, TriggerFuncArgs + + %type func_return + %type set_opt +--- 164,171 ---- + sort_clause, sortby_list, index_params, index_list, name_list, + from_clause, from_list, opt_array_bounds, nest_array_bounds, + expr_list, attrs, res_target_list, res_target_list2, +! def_list, opt_indirection, group_clause, TriggerFuncArgs, +! opt_select_limit + + %type func_return + %type set_opt +*************** +*** 192,197 **** +--- 194,201 ---- + + %type fetch_how_many + ++ %type select_limit_value select_offset_value ++ + %type OptSeqList + %type OptSeqElem + +*************** +*** 267,273 **** + FALSE_P, FETCH, FLOAT, FOR, FOREIGN, FROM, FULL, + GRANT, GROUP, HAVING, HOUR_P, + IN, INNER_P, INSENSITIVE, INSERT, INTERVAL, INTO, IS, +! JOIN, KEY, LANGUAGE, LEADING, LEFT, LIKE, LOCAL, + MATCH, MINUTE_P, MONTH_P, NAMES, + NATIONAL, NATURAL, NCHAR, NEXT, NO, NOT, NULL_P, NUMERIC, + OF, ON, ONLY, OPTION, OR, ORDER, OUTER_P, +--- 271,277 ---- + FALSE_P, FETCH, FLOAT, FOR, FOREIGN, FROM, FULL, + GRANT, GROUP, HAVING, HOUR_P, + IN, INNER_P, INSENSITIVE, INSERT, INTERVAL, INTO, IS, +! 
JOIN, KEY, LANGUAGE, LEADING, LEFT, LIKE, LIMIT, LOCAL, + MATCH, MINUTE_P, MONTH_P, NAMES, + NATIONAL, NATURAL, NCHAR, NEXT, NO, NOT, NULL_P, NUMERIC, + OF, ON, ONLY, OPTION, OR, ORDER, OUTER_P, +*************** +*** 299,305 **** + INCREMENT, INDEX, INHERITS, INSTEAD, ISNULL, + LANCOMPILER, LISTEN, LOAD, LOCATION, LOCK_P, MAXVALUE, MINVALUE, MOVE, + NEW, NOCREATEDB, NOCREATEUSER, NONE, NOTHING, NOTIFY, NOTNULL, +! OIDS, OPERATOR, PASSWORD, PROCEDURAL, + RECIPE, RENAME, RESET, RETURNS, ROW, RULE, + SEQUENCE, SERIAL, SETOF, SHOW, START, STATEMENT, STDIN, STDOUT, TRUSTED, + UNLISTEN, UNTIL, VACUUM, VALID, VERBOSE, VERSION +--- 303,309 ---- + INCREMENT, INDEX, INHERITS, INSTEAD, ISNULL, + LANCOMPILER, LISTEN, LOAD, LOCATION, LOCK_P, MAXVALUE, MINVALUE, MOVE, + NEW, NOCREATEDB, NOCREATEUSER, NONE, NOTHING, NOTIFY, NOTNULL, +! OFFSET, OIDS, OPERATOR, PASSWORD, PROCEDURAL, + RECIPE, RENAME, RESET, RETURNS, ROW, RULE, + SEQUENCE, SERIAL, SETOF, SHOW, START, STATEMENT, STDIN, STDOUT, TRUSTED, + UNLISTEN, UNTIL, VACUUM, VALID, VERBOSE, VERSION +*************** +*** 2591,2596 **** +--- 2595,2601 ---- + result from_clause where_clause + group_clause having_clause + union_clause sort_clause ++ opt_select_limit + { + SelectStmt *n = makeNode(SelectStmt); + n->unique = $2; +*************** +*** 2602,2607 **** +--- 2607,2622 ---- + n->havingClause = $8; + n->unionClause = $9; + n->sortClause = $10; ++ if ($11 != NIL) ++ { ++ n->limitOffset = nth(0, $11); ++ n->limitCount = nth(1, $11); ++ } ++ else ++ { ++ n->limitOffset = NULL; ++ n->limitCount = NULL; ++ } + $$ = (Node *)n; + } + ; +*************** +*** 2699,2704 **** +--- 2714,2794 ---- + | ASC { $$ = "<"; } + | DESC { $$ = ">"; } + | /*EMPTY*/ { $$ = "<"; /*default*/ } ++ ; ++ ++ opt_select_limit: LIMIT select_offset_value ',' select_limit_value ++ { $$ = lappend(lappend(NIL, $2), $4); } ++ | LIMIT select_limit_value OFFSET select_offset_value ++ { $$ = lappend(lappend(NIL, $4), $2); } ++ | LIMIT select_limit_value ++ { $$ = lappend(lappend(NIL, NULL), $2); } ++ | OFFSET select_offset_value LIMIT select_limit_value ++ { $$ = lappend(lappend(NIL, $2), $4); } ++ | OFFSET select_offset_value ++ { $$ = lappend(lappend(NIL, $2), NULL); } ++ | /* EMPTY */ ++ { $$ = NIL; } ++ ; ++ ++ select_limit_value: Iconst ++ { ++ Const *n = makeNode(Const); ++ ++ if ($1 < 1) ++ elog(ERROR, "selection limit must be ALL or a positive integer value > 0"); ++ ++ n->consttype = INT4OID; ++ n->constlen = sizeof(int4); ++ n->constvalue = (Datum)$1; ++ n->constisnull = FALSE; ++ n->constbyval = TRUE; ++ n->constisset = FALSE; ++ n->constiscast = FALSE; ++ $$ = (Node *)n; ++ } ++ | ALL ++ { ++ Const *n = makeNode(Const); ++ n->consttype = INT4OID; ++ n->constlen = sizeof(int4); ++ n->constvalue = (Datum)0; ++ n->constisnull = FALSE; ++ n->constbyval = TRUE; ++ n->constisset = FALSE; ++ n->constiscast = FALSE; ++ $$ = (Node *)n; ++ } ++ | PARAM ++ { ++ Param *n = makeNode(Param); ++ n->paramkind = PARAM_NUM; ++ n->paramid = $1; ++ n->paramtype = INT4OID; ++ $$ = (Node *)n; ++ } ++ ; ++ ++ select_offset_value: Iconst ++ { ++ Const *n = makeNode(Const); ++ ++ n->consttype = INT4OID; ++ n->constlen = sizeof(int4); ++ n->constvalue = (Datum)$1; ++ n->constisnull = FALSE; ++ n->constbyval = TRUE; ++ n->constisset = FALSE; ++ n->constiscast = FALSE; ++ $$ = (Node *)n; ++ } ++ | PARAM ++ { ++ Param *n = makeNode(Param); ++ n->paramkind = PARAM_NUM; ++ n->paramid = $1; ++ n->paramtype = INT4OID; ++ $$ = (Node *)n; ++ } + ; + + /* +diff -cr src.orig/backend/parser/keywords.c 
src/backend/parser/keywords.c +*** src.orig/backend/parser/keywords.c Fri Oct 16 11:53:42 1998 +--- src/backend/parser/keywords.c Sun Oct 18 22:13:29 1998 +*************** +*** 128,133 **** +--- 128,134 ---- + {"leading", LEADING}, + {"left", LEFT}, + {"like", LIKE}, ++ {"limit", LIMIT}, + {"listen", LISTEN}, + {"load", LOAD}, + {"local", LOCAL}, +*************** +*** 156,161 **** +--- 157,163 ---- + {"null", NULL_P}, + {"numeric", NUMERIC}, + {"of", OF}, ++ {"offset", OFFSET}, + {"oids", OIDS}, + {"old", CURRENT}, + {"on", ON}, +diff -cr src.orig/backend/rewrite/rewriteDefine.c src/backend/rewrite/rewriteDefine.c +*** src.orig/backend/rewrite/rewriteDefine.c Fri Oct 16 11:53:46 1998 +--- src/backend/rewrite/rewriteDefine.c Fri Oct 16 13:48:55 1998 +*************** +*** 312,317 **** +--- 312,323 ---- + heap_close(event_relation); + + /* ++ * LIMIT in view is not supported ++ */ ++ if (query->limitOffset != NULL || query->limitCount != NULL) ++ elog(ERROR, "LIMIT clause not supported in views"); ++ ++ /* + * ... and finally the rule must be named _RETviewname. + */ + sprintf(expected_name, "_RET%s", event_obj->relname); +diff -cr src.orig/backend/tcop/pquery.c src/backend/tcop/pquery.c +*** src.orig/backend/tcop/pquery.c Fri Oct 16 11:53:47 1998 +--- src/backend/tcop/pquery.c Fri Oct 16 14:02:36 1998 +*************** +*** 40,46 **** + #include "commands/command.h" + + static char *CreateOperationTag(int operationType); +! static void ProcessQueryDesc(QueryDesc *queryDesc); + + + /* ---------------------------------------------------------------- +--- 40,46 ---- + #include "commands/command.h" + + static char *CreateOperationTag(int operationType); +! static void ProcessQueryDesc(QueryDesc *queryDesc, Node *limoffset, Node *limcount); + + + /* ---------------------------------------------------------------- +*************** +*** 205,211 **** + * ---------------------------------------------------------------- + */ + static void +! ProcessQueryDesc(QueryDesc *queryDesc) + { + Query *parseTree; + Plan *plan; +--- 205,211 ---- + * ---------------------------------------------------------------- + */ + static void +! ProcessQueryDesc(QueryDesc *queryDesc, Node *limoffset, Node *limcount) + { + Query *parseTree; + Plan *plan; +*************** +*** 330,336 **** + * actually run the plan.. + * ---------------- + */ +! ExecutorRun(queryDesc, state, EXEC_RUN, 0); + + /* save infos for EndCommand */ + UpdateCommandInfo(operation, state->es_lastoid, state->es_processed); +--- 330,336 ---- + * actually run the plan.. + * ---------------- + */ +! ExecutorRun(queryDesc, state, EXEC_RUN, limoffset, limcount); + + /* save infos for EndCommand */ + UpdateCommandInfo(operation, state->es_lastoid, state->es_processed); +*************** +*** 373,377 **** + print_plan(plan, parsetree); + } + else +! ProcessQueryDesc(queryDesc); + } +--- 373,377 ---- + print_plan(plan, parsetree); + } + else +! ProcessQueryDesc(queryDesc, parsetree->limitOffset, parsetree->limitCount); + } +diff -cr src.orig/include/executor/executor.h src/include/executor/executor.h +*** src.orig/include/executor/executor.h Fri Oct 16 11:53:56 1998 +--- src/include/executor/executor.h Fri Oct 16 12:04:17 1998 +*************** +*** 83,89 **** + * prototypes from functions in execMain.c + */ + extern TupleDesc ExecutorStart(QueryDesc *queryDesc, EState *estate); +! 
extern TupleTableSlot *ExecutorRun(QueryDesc *queryDesc, EState *estate, int feature, int count); + extern void ExecutorEnd(QueryDesc *queryDesc, EState *estate); + extern HeapTuple ExecConstraints(char *caller, Relation rel, HeapTuple tuple); + #ifdef QUERY_LIMIT +--- 83,89 ---- + * prototypes from functions in execMain.c + */ + extern TupleDesc ExecutorStart(QueryDesc *queryDesc, EState *estate); +! extern TupleTableSlot *ExecutorRun(QueryDesc *queryDesc, EState *estate, int feature, Node *limoffset, Node *limcount); + extern void ExecutorEnd(QueryDesc *queryDesc, EState *estate); + extern HeapTuple ExecConstraints(char *caller, Relation rel, HeapTuple tuple); + #ifdef QUERY_LIMIT + +From owner-pgsql-hackers@hub.org Thu Oct 22 13:12:34 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id NAA01350 + for ; Thu, 22 Oct 1998 13:12:29 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id MAA17808 for ; Thu, 22 Oct 1998 12:35:22 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.9.1/8.8.8) with SMTP id KAA14887; + Thu, 22 Oct 1998 10:49:09 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Thu, 22 Oct 1998 10:44:59 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.9.1/8.8.8) id KAA14445 + for pgsql-hackers-outgoing; Thu, 22 Oct 1998 10:44:57 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from dsh.de (firewall-user@neptun.sns-felb.debis.de [53.122.101.2]) + by hub.org (8.9.1/8.8.8) with ESMTP id KAA14431 + for ; Thu, 22 Oct 1998 10:44:47 -0400 (EDT) + (envelope-from wieck@sapserv.debis.de) +Received: by dsh.de; id QAA19394; Thu, 22 Oct 1998 16:43:42 +0200 (MET DST) +Received: from dshmail.dsh.de(53.47.15.3) by neptun.dsh.de via smap (3.2) + id xma017268; Thu, 22 Oct 98 16:39:44 +0200 +Received: from mail1.hh1.dsh.de (mail1.hh1.dsh.de [53.47.9.5]) + by dshmail.dsh.de (8.8.7/8.8.7) with ESMTP id QAA02988; + Thu, 22 Oct 1998 16:36:46 +0200 (MET DST) +Received: from mars.SAPserv.Hamburg.dsh.de (root@mail5.hh1.dsh.de [53.2.168.17]) + by mail1.hh1.dsh.de (8.8.7/8.8.7) with SMTP id QAA19155; + Thu, 22 Oct 1998 16:39:10 +0200 +Received: from orion.SAPserv.Hamburg.dsh.de + by mars.SAPserv.Hamburg.dsh.de with smtp + for <> + id m0zWJWL-000B5DC; Thu, 22 Oct 98 14:07 MET DST +Received: by orion.SAPserv.Hamburg.dsh.de + for maillist@candle.pha.pa.us + id m0zWM4W-000EBPC; Thu, 22 Oct 98 16:50 MET DST +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] psql's help (the LIMIT stuff) +To: maillist@candle.pha.pa.us (Bruce Momjian) +Date: Thu, 22 Oct 1998 16:50:40 +0200 (MET DST) +Cc: jwieck@debis.com, jose@sferacarta.com, pgsql-hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <199810221424.KAA20601@candle.pha.pa.us> from "Bruce Momjian" at Oct 22, 98 10:24:08 am +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +> +> > > > +> > > > I hope the QUERY_LIMIT too. +> > > +> > > I still have that cnfify() possible fix to review for KQSO. Are you +> > > still thinking limit for 6.4 final, or a minor release after that? 
+> > +> > I posted the part that is the minimum applied to 6.4 to make +> > adding LIMIT later non-initdb earlier. Anyway, here it's +> > again. +> +> Already applied. I assume it is the same as the one I applied. + + Seen, thanks. Your 'Applied' just arrived after I packed it + again. It's the same. + +> We are close to final, and can easily put it in 6.4.1, which I am sure +> we will need, and if we split CVS trees, you'll have lots of minor +> versions to pick from. :-) +> +> Seems like it would be a nice minor release item, but the problem is +> that minor releases aren't tested as much as major ones. How confident +> are you in the code? What do others thing? + + I regression tested it, and did additional tests in the + SPI/PL area. It works. It only touches the parser and the + executor. Rules, planner/optimizer just bypass the values in + the parsetree. The parser and the executor are parts of + Postgres I feel very familiar with (not so in the optimizer). + I trust in the code and would use it in a production + environment. + + It's below. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. # +#======================================== jwieck@debis.com (Jan Wieck) # + + +diff -cr src.orig/backend/commands/command.c src/backend/commands/command.c +*** src.orig/backend/commands/command.c Fri Oct 16 11:53:38 1998 +--- src/backend/commands/command.c Fri Oct 16 12:56:44 1998 +*************** +*** 39,44 **** +--- 39,45 ---- + #include "utils/mcxt.h" + #include "utils/portal.h" + #include "utils/syscache.h" ++ #include "string.h" + + /* ---------------- + * PortalExecutorHeapMemory stuff +*************** +*** 101,106 **** +--- 102,108 ---- + int feature; + QueryDesc *queryDesc; + MemoryContext context; ++ Const limcount; + + /* ---------------- + * sanity checks +*************** +*** 113,118 **** +--- 115,134 ---- + } + + /* ---------------- ++ * Create a const node from the given count value ++ * ---------------- ++ */ ++ memset(&limcount, 0, sizeof(limcount)); ++ limcount.type = T_Const; ++ limcount.consttype = INT4OID; ++ limcount.constlen = sizeof(int4); ++ limcount.constvalue = (Datum)count; ++ limcount.constisnull = FALSE; ++ limcount.constbyval = TRUE; ++ limcount.constisset = FALSE; ++ limcount.constiscast = FALSE; ++ ++ /* ---------------- + * get the portal from the portal name + * ---------------- + */ +*************** +*** 176,182 **** + PortalExecutorHeapMemory = (MemoryContext) + PortalGetHeapMemory(portal); + +! ExecutorRun(queryDesc, PortalGetState(portal), feature, count); + + if (dest == None) /* MOVE */ + pfree(queryDesc); +--- 192,198 ---- + PortalExecutorHeapMemory = (MemoryContext) + PortalGetHeapMemory(portal); + +! 
ExecutorRun(queryDesc, PortalGetState(portal), feature, (Node *)NULL, (Node *)&limcount); + + if (dest == None) /* MOVE */ + pfree(queryDesc); +diff -cr src.orig/backend/executor/execMain.c src/backend/executor/execMain.c +*** src.orig/backend/executor/execMain.c Fri Oct 16 11:53:38 1998 +--- src/backend/executor/execMain.c Fri Oct 16 20:05:19 1998 +*************** +*** 64,69 **** +--- 64,70 ---- + static void EndPlan(Plan *plan, EState *estate); + static TupleTableSlot *ExecutePlan(EState *estate, Plan *plan, + Query *parseTree, CmdType operation, ++ int offsetTuples, + int numberTuples, ScanDirection direction, + void (*printfunc) ()); + static void ExecRetrieve(TupleTableSlot *slot, void (*printfunc) (), +*************** +*** 163,169 **** + * ---------------------------------------------------------------- + */ + TupleTableSlot * +! ExecutorRun(QueryDesc *queryDesc, EState *estate, int feature, int count) + { + CmdType operation; + Query *parseTree; +--- 164,170 ---- + * ---------------------------------------------------------------- + */ + TupleTableSlot * +! ExecutorRun(QueryDesc *queryDesc, EState *estate, int feature, Node *limoffset, Node *limcount) + { + CmdType operation; + Query *parseTree; +*************** +*** 171,176 **** +--- 172,179 ---- + TupleTableSlot *result; + CommandDest dest; + void (*destination) (); ++ int offset = 0; ++ int count = 0; + + /****************** + * sanity checks +*************** +*** 191,196 **** +--- 194,289 ---- + estate->es_processed = 0; + estate->es_lastoid = InvalidOid; + ++ /****************** ++ * if given get the offset of the LIMIT clause ++ ****************** ++ */ ++ if (limoffset != NULL) ++ { ++ Const *coffset; ++ Param *poffset; ++ ParamListInfo paramLI; ++ int i; ++ ++ switch (nodeTag(limoffset)) ++ { ++ case T_Const: ++ coffset = (Const *)limoffset; ++ offset = (int)(coffset->constvalue); ++ break; ++ ++ case T_Param: ++ poffset = (Param *)limoffset; ++ paramLI = estate->es_param_list_info; ++ ++ if (paramLI == NULL) ++ elog(ERROR, "parameter for limit offset not in executor state"); ++ for (i = 0; paramLI[i].kind != PARAM_INVALID; i++) ++ { ++ if (paramLI[i].kind == PARAM_NUM && paramLI[i].id == poffset->paramid) ++ break; ++ } ++ if (paramLI[i].kind == PARAM_INVALID) ++ elog(ERROR, "parameter for limit offset not in executor state"); ++ if (paramLI[i].isnull) ++ elog(ERROR, "limit offset cannot be NULL value"); ++ offset = (int)(paramLI[i].value); ++ ++ break; ++ ++ default: ++ elog(ERROR, "unexpected node type %d as limit offset", nodeTag(limoffset)); ++ } ++ ++ if (offset < 0) ++ elog(ERROR, "limit offset cannot be negative"); ++ } ++ ++ /****************** ++ * if given get the count of the LIMIT clause ++ ****************** ++ */ ++ if (limcount != NULL) ++ { ++ Const *ccount; ++ Param *pcount; ++ ParamListInfo paramLI; ++ int i; ++ ++ switch (nodeTag(limcount)) ++ { ++ case T_Const: ++ ccount = (Const *)limcount; ++ count = (int)(ccount->constvalue); ++ break; ++ ++ case T_Param: ++ pcount = (Param *)limcount; ++ paramLI = estate->es_param_list_info; ++ ++ if (paramLI == NULL) ++ elog(ERROR, "parameter for limit count not in executor state"); ++ for (i = 0; paramLI[i].kind != PARAM_INVALID; i++) ++ { ++ if (paramLI[i].kind == PARAM_NUM && paramLI[i].id == pcount->paramid) ++ break; ++ } ++ if (paramLI[i].kind == PARAM_INVALID) ++ elog(ERROR, "parameter for limit count not in executor state"); ++ if (paramLI[i].isnull) ++ elog(ERROR, "limit count cannot be NULL value"); ++ count = (int)(paramLI[i].value); ++ ++ break; ++ ++ 
default: ++ elog(ERROR, "unexpected node type %d as limit count", nodeTag(limcount)); ++ } ++ ++ if (count < 0) ++ elog(ERROR, "limit count cannot be negative"); ++ } ++ + switch (feature) + { + +*************** +*** 199,205 **** + plan, + parseTree, + operation, +! ALL_TUPLES, + ForwardScanDirection, + destination); + break; +--- 292,299 ---- + plan, + parseTree, + operation, +! offset, +! count, + ForwardScanDirection, + destination); + break; +*************** +*** 208,213 **** +--- 302,308 ---- + plan, + parseTree, + operation, ++ offset, + count, + ForwardScanDirection, + destination); +*************** +*** 222,227 **** +--- 317,323 ---- + plan, + parseTree, + operation, ++ offset, + count, + BackwardScanDirection, + destination); +*************** +*** 237,242 **** +--- 333,339 ---- + plan, + parseTree, + operation, ++ 0, + ONE_TUPLE, + ForwardScanDirection, + destination); +*************** +*** 691,696 **** +--- 788,794 ---- + Plan *plan, + Query *parseTree, + CmdType operation, ++ int offsetTuples, + int numberTuples, + ScanDirection direction, + void (*printfunc) ()) +*************** +*** 742,747 **** +--- 840,859 ---- + { + result = NULL; + break; ++ } ++ ++ /****************** ++ * For now we completely execute the plan and skip ++ * result tuples if requested by LIMIT offset. ++ * Finally we should try to do it in deeper levels ++ * if possible (during index scan) ++ * - Jan ++ ****************** ++ */ ++ if (offsetTuples > 0) ++ { ++ --offsetTuples; ++ continue; + } + + /****************** +diff -cr src.orig/backend/executor/functions.c src/backend/executor/functions.c +*** src.orig/backend/executor/functions.c Fri Oct 16 11:53:38 1998 +--- src/backend/executor/functions.c Fri Oct 16 19:01:02 1998 +*************** +*** 130,135 **** +--- 130,138 ---- + None); + estate = CreateExecutorState(); + ++ if (queryTree->limitOffset != NULL || queryTree->limitCount != NULL) ++ elog(ERROR, "LIMIT clause from SQL functions not yet implemented"); ++ + if (nargs > 0) + { + int i; +*************** +*** 200,206 **** + + feature = (LAST_POSTQUEL_COMMAND(es)) ? EXEC_RETONE : EXEC_RUN; + +! return ExecutorRun(es->qd, es->estate, feature, 0); + } + + static void +--- 203,209 ---- + + feature = (LAST_POSTQUEL_COMMAND(es)) ? EXEC_RETONE : EXEC_RUN; + +! return ExecutorRun(es->qd, es->estate, feature, (Node *)NULL, (Node *)NULL); + } + + static void +diff -cr src.orig/backend/executor/spi.c src/backend/executor/spi.c +*** src.orig/backend/executor/spi.c Fri Oct 16 11:53:39 1998 +--- src/backend/executor/spi.c Fri Oct 16 19:25:33 1998 +*************** +*** 791,796 **** +--- 791,798 ---- + bool isRetrieveIntoRelation = false; + char *intoName = NULL; + int res; ++ Const tcount_const; ++ Node *count = NULL; + + switch (operation) + { +*************** +*** 825,830 **** +--- 827,865 ---- + return SPI_ERROR_OPUNKNOWN; + } + ++ /* ---------------- ++ * Get the query LIMIT tuple count ++ * ---------------- ++ */ ++ if (parseTree->limitCount != NULL) ++ { ++ /* ---------------- ++ * A limit clause in the parsetree overrides the ++ * tcount parameter ++ * ---------------- ++ */ ++ count = parseTree->limitCount; ++ } ++ else ++ { ++ /* ---------------- ++ * No LIMIT clause in parsetree. 
Use a local Const node ++ * to put tcount into it ++ * ---------------- ++ */ ++ memset(&tcount_const, 0, sizeof(tcount_const)); ++ tcount_const.type = T_Const; ++ tcount_const.consttype = INT4OID; ++ tcount_const.constlen = sizeof(int4); ++ tcount_const.constvalue = (Datum)tcount; ++ tcount_const.constisnull = FALSE; ++ tcount_const.constbyval = TRUE; ++ tcount_const.constisset = FALSE; ++ tcount_const.constiscast = FALSE; ++ ++ count = (Node *)&tcount_const; ++ } ++ + if (state == NULL) /* plan preparation */ + return res; + #ifdef SPI_EXECUTOR_STATS +*************** +*** 845,851 **** + return SPI_OK_CURSOR; + } + +! ExecutorRun(queryDesc, state, EXEC_FOR, tcount); + + _SPI_current->processed = state->es_processed; + if (operation == CMD_SELECT && queryDesc->dest == SPI) +--- 880,886 ---- + return SPI_OK_CURSOR; + } + +! ExecutorRun(queryDesc, state, EXEC_FOR, parseTree->limitOffset, count); + + _SPI_current->processed = state->es_processed; + if (operation == CMD_SELECT && queryDesc->dest == SPI) +diff -cr src.orig/backend/parser/analyze.c src/backend/parser/analyze.c +*** src.orig/backend/parser/analyze.c Fri Oct 16 11:53:41 1998 +--- src/backend/parser/analyze.c Fri Oct 16 13:29:27 1998 +*************** +*** 180,186 **** +--- 180,190 ---- + + case T_SelectStmt: + if (!((SelectStmt *) parseTree)->portalname) ++ { + result = transformSelectStmt(pstate, (SelectStmt *) parseTree); ++ result->limitOffset = ((SelectStmt *)parseTree)->limitOffset; ++ result->limitCount = ((SelectStmt *)parseTree)->limitCount; ++ } + else + result = transformCursorStmt(pstate, (SelectStmt *) parseTree); + break; +diff -cr src.orig/backend/parser/gram.y src/backend/parser/gram.y +*** src.orig/backend/parser/gram.y Fri Oct 16 11:53:42 1998 +--- src/backend/parser/gram.y Sun Oct 18 22:20:36 1998 +*************** +*** 45,50 **** +--- 45,51 ---- + #include "catalog/catname.h" + #include "utils/elog.h" + #include "access/xact.h" ++ #include "catalog/pg_type.h" + + #ifdef MULTIBYTE + #include "mb/pg_wchar.h" +*************** +*** 163,169 **** + sort_clause, sortby_list, index_params, index_list, name_list, + from_clause, from_list, opt_array_bounds, nest_array_bounds, + expr_list, attrs, res_target_list, res_target_list2, +! def_list, opt_indirection, group_clause, TriggerFuncArgs + + %type func_return + %type set_opt +--- 164,171 ---- + sort_clause, sortby_list, index_params, index_list, name_list, + from_clause, from_list, opt_array_bounds, nest_array_bounds, + expr_list, attrs, res_target_list, res_target_list2, +! def_list, opt_indirection, group_clause, TriggerFuncArgs, +! opt_select_limit + + %type func_return + %type set_opt +*************** +*** 192,197 **** +--- 194,201 ---- + + %type fetch_how_many + ++ %type select_limit_value select_offset_value ++ + %type OptSeqList + %type OptSeqElem + +*************** +*** 267,273 **** + FALSE_P, FETCH, FLOAT, FOR, FOREIGN, FROM, FULL, + GRANT, GROUP, HAVING, HOUR_P, + IN, INNER_P, INSENSITIVE, INSERT, INTERVAL, INTO, IS, +! JOIN, KEY, LANGUAGE, LEADING, LEFT, LIKE, LOCAL, + MATCH, MINUTE_P, MONTH_P, NAMES, + NATIONAL, NATURAL, NCHAR, NEXT, NO, NOT, NULL_P, NUMERIC, + OF, ON, ONLY, OPTION, OR, ORDER, OUTER_P, +--- 271,277 ---- + FALSE_P, FETCH, FLOAT, FOR, FOREIGN, FROM, FULL, + GRANT, GROUP, HAVING, HOUR_P, + IN, INNER_P, INSENSITIVE, INSERT, INTERVAL, INTO, IS, +! 
JOIN, KEY, LANGUAGE, LEADING, LEFT, LIKE, LIMIT, LOCAL, + MATCH, MINUTE_P, MONTH_P, NAMES, + NATIONAL, NATURAL, NCHAR, NEXT, NO, NOT, NULL_P, NUMERIC, + OF, ON, ONLY, OPTION, OR, ORDER, OUTER_P, +*************** +*** 299,305 **** + INCREMENT, INDEX, INHERITS, INSTEAD, ISNULL, + LANCOMPILER, LISTEN, LOAD, LOCATION, LOCK_P, MAXVALUE, MINVALUE, MOVE, + NEW, NOCREATEDB, NOCREATEUSER, NONE, NOTHING, NOTIFY, NOTNULL, +! OIDS, OPERATOR, PASSWORD, PROCEDURAL, + RECIPE, RENAME, RESET, RETURNS, ROW, RULE, + SEQUENCE, SERIAL, SETOF, SHOW, START, STATEMENT, STDIN, STDOUT, TRUSTED, + UNLISTEN, UNTIL, VACUUM, VALID, VERBOSE, VERSION +--- 303,309 ---- + INCREMENT, INDEX, INHERITS, INSTEAD, ISNULL, + LANCOMPILER, LISTEN, LOAD, LOCATION, LOCK_P, MAXVALUE, MINVALUE, MOVE, + NEW, NOCREATEDB, NOCREATEUSER, NONE, NOTHING, NOTIFY, NOTNULL, +! OFFSET, OIDS, OPERATOR, PASSWORD, PROCEDURAL, + RECIPE, RENAME, RESET, RETURNS, ROW, RULE, + SEQUENCE, SERIAL, SETOF, SHOW, START, STATEMENT, STDIN, STDOUT, TRUSTED, + UNLISTEN, UNTIL, VACUUM, VALID, VERBOSE, VERSION +*************** +*** 2591,2596 **** +--- 2595,2601 ---- + result from_clause where_clause + group_clause having_clause + union_clause sort_clause ++ opt_select_limit + { + SelectStmt *n = makeNode(SelectStmt); + n->unique = $2; +*************** +*** 2602,2607 **** +--- 2607,2622 ---- + n->havingClause = $8; + n->unionClause = $9; + n->sortClause = $10; ++ if ($11 != NIL) ++ { ++ n->limitOffset = nth(0, $11); ++ n->limitCount = nth(1, $11); ++ } ++ else ++ { ++ n->limitOffset = NULL; ++ n->limitCount = NULL; ++ } + $$ = (Node *)n; + } + ; +*************** +*** 2699,2704 **** +--- 2714,2794 ---- + | ASC { $$ = "<"; } + | DESC { $$ = ">"; } + | /*EMPTY*/ { $$ = "<"; /*default*/ } ++ ; ++ ++ opt_select_limit: LIMIT select_offset_value ',' select_limit_value ++ { $$ = lappend(lappend(NIL, $2), $4); } ++ | LIMIT select_limit_value OFFSET select_offset_value ++ { $$ = lappend(lappend(NIL, $4), $2); } ++ | LIMIT select_limit_value ++ { $$ = lappend(lappend(NIL, NULL), $2); } ++ | OFFSET select_offset_value LIMIT select_limit_value ++ { $$ = lappend(lappend(NIL, $2), $4); } ++ | OFFSET select_offset_value ++ { $$ = lappend(lappend(NIL, $2), NULL); } ++ | /* EMPTY */ ++ { $$ = NIL; } ++ ; ++ ++ select_limit_value: Iconst ++ { ++ Const *n = makeNode(Const); ++ ++ if ($1 < 1) ++ elog(ERROR, "selection limit must be ALL or a positive integer value > 0"); ++ ++ n->consttype = INT4OID; ++ n->constlen = sizeof(int4); ++ n->constvalue = (Datum)$1; ++ n->constisnull = FALSE; ++ n->constbyval = TRUE; ++ n->constisset = FALSE; ++ n->constiscast = FALSE; ++ $$ = (Node *)n; ++ } ++ | ALL ++ { ++ Const *n = makeNode(Const); ++ n->consttype = INT4OID; ++ n->constlen = sizeof(int4); ++ n->constvalue = (Datum)0; ++ n->constisnull = FALSE; ++ n->constbyval = TRUE; ++ n->constisset = FALSE; ++ n->constiscast = FALSE; ++ $$ = (Node *)n; ++ } ++ | PARAM ++ { ++ Param *n = makeNode(Param); ++ n->paramkind = PARAM_NUM; ++ n->paramid = $1; ++ n->paramtype = INT4OID; ++ $$ = (Node *)n; ++ } ++ ; ++ ++ select_offset_value: Iconst ++ { ++ Const *n = makeNode(Const); ++ ++ n->consttype = INT4OID; ++ n->constlen = sizeof(int4); ++ n->constvalue = (Datum)$1; ++ n->constisnull = FALSE; ++ n->constbyval = TRUE; ++ n->constisset = FALSE; ++ n->constiscast = FALSE; ++ $$ = (Node *)n; ++ } ++ | PARAM ++ { ++ Param *n = makeNode(Param); ++ n->paramkind = PARAM_NUM; ++ n->paramid = $1; ++ n->paramtype = INT4OID; ++ $$ = (Node *)n; ++ } + ; + + /* +diff -cr src.orig/backend/parser/keywords.c 
src/backend/parser/keywords.c +*** src.orig/backend/parser/keywords.c Fri Oct 16 11:53:42 1998 +--- src/backend/parser/keywords.c Sun Oct 18 22:13:29 1998 +*************** +*** 128,133 **** +--- 128,134 ---- + {"leading", LEADING}, + {"left", LEFT}, + {"like", LIKE}, ++ {"limit", LIMIT}, + {"listen", LISTEN}, + {"load", LOAD}, + {"local", LOCAL}, +*************** +*** 156,161 **** +--- 157,163 ---- + {"null", NULL_P}, + {"numeric", NUMERIC}, + {"of", OF}, ++ {"offset", OFFSET}, + {"oids", OIDS}, + {"old", CURRENT}, + {"on", ON}, +diff -cr src.orig/backend/rewrite/rewriteDefine.c src/backend/rewrite/rewriteDefine.c +*** src.orig/backend/rewrite/rewriteDefine.c Fri Oct 16 11:53:46 1998 +--- src/backend/rewrite/rewriteDefine.c Fri Oct 16 13:48:55 1998 +*************** +*** 312,317 **** +--- 312,323 ---- + heap_close(event_relation); + + /* ++ * LIMIT in view is not supported ++ */ ++ if (query->limitOffset != NULL || query->limitCount != NULL) ++ elog(ERROR, "LIMIT clause not supported in views"); ++ ++ /* + * ... and finally the rule must be named _RETviewname. + */ + sprintf(expected_name, "_RET%s", event_obj->relname); +diff -cr src.orig/backend/tcop/pquery.c src/backend/tcop/pquery.c +*** src.orig/backend/tcop/pquery.c Fri Oct 16 11:53:47 1998 +--- src/backend/tcop/pquery.c Fri Oct 16 14:02:36 1998 +*************** +*** 40,46 **** + #include "commands/command.h" + + static char *CreateOperationTag(int operationType); +! static void ProcessQueryDesc(QueryDesc *queryDesc); + + + /* ---------------------------------------------------------------- +--- 40,46 ---- + #include "commands/command.h" + + static char *CreateOperationTag(int operationType); +! static void ProcessQueryDesc(QueryDesc *queryDesc, Node *limoffset, Node *limcount); + + + /* ---------------------------------------------------------------- +*************** +*** 205,211 **** + * ---------------------------------------------------------------- + */ + static void +! ProcessQueryDesc(QueryDesc *queryDesc) + { + Query *parseTree; + Plan *plan; +--- 205,211 ---- + * ---------------------------------------------------------------- + */ + static void +! ProcessQueryDesc(QueryDesc *queryDesc, Node *limoffset, Node *limcount) + { + Query *parseTree; + Plan *plan; +*************** +*** 330,336 **** + * actually run the plan.. + * ---------------- + */ +! ExecutorRun(queryDesc, state, EXEC_RUN, 0); + + /* save infos for EndCommand */ + UpdateCommandInfo(operation, state->es_lastoid, state->es_processed); +--- 330,336 ---- + * actually run the plan.. + * ---------------- + */ +! ExecutorRun(queryDesc, state, EXEC_RUN, limoffset, limcount); + + /* save infos for EndCommand */ + UpdateCommandInfo(operation, state->es_lastoid, state->es_processed); +*************** +*** 373,377 **** + print_plan(plan, parsetree); + } + else +! ProcessQueryDesc(queryDesc); + } +--- 373,377 ---- + print_plan(plan, parsetree); + } + else +! ProcessQueryDesc(queryDesc, parsetree->limitOffset, parsetree->limitCount); + } +diff -cr src.orig/include/executor/executor.h src/include/executor/executor.h +*** src.orig/include/executor/executor.h Fri Oct 16 11:53:56 1998 +--- src/include/executor/executor.h Fri Oct 16 12:04:17 1998 +*************** +*** 83,89 **** + * prototypes from functions in execMain.c + */ + extern TupleDesc ExecutorStart(QueryDesc *queryDesc, EState *estate); +! 
extern TupleTableSlot *ExecutorRun(QueryDesc *queryDesc, EState *estate, int feature, int count); + extern void ExecutorEnd(QueryDesc *queryDesc, EState *estate); + extern HeapTuple ExecConstraints(char *caller, Relation rel, HeapTuple tuple); + #ifdef QUERY_LIMIT +--- 83,89 ---- + * prototypes from functions in execMain.c + */ + extern TupleDesc ExecutorStart(QueryDesc *queryDesc, EState *estate); +! extern TupleTableSlot *ExecutorRun(QueryDesc *queryDesc, EState *estate, int feature, Node *limoffset, Node *limcount); + extern void ExecutorEnd(QueryDesc *queryDesc, EState *estate); + extern HeapTuple ExecConstraints(char *caller, Relation rel, HeapTuple tuple); + #ifdef QUERY_LIMIT + + diff --git a/doc/TODO.detail/logging b/doc/TODO.detail/logging new file mode 100644 index 0000000000..2decf2a529 --- /dev/null +++ b/doc/TODO.detail/logging @@ -0,0 +1,207 @@ +From owner-pgsql-hackers@hub.org Fri Nov 13 13:24:37 1998 +Received: from hub.org (majordom@hub.org [209.47.148.200]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id NAA13457 + for ; Fri, 13 Nov 1998 13:24:35 -0500 (EST) +Received: from localhost (majordom@localhost) + by hub.org (8.9.1/8.9.1) with SMTP id NAA02464; + Fri, 13 Nov 1998 13:22:52 -0500 (EST) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Fri, 13 Nov 1998 13:21:14 +0000 (EST) +Received: (from majordom@localhost) + by hub.org (8.9.1/8.9.1) id NAA02331 + for pgsql-hackers-outgoing; Fri, 13 Nov 1998 13:21:12 -0500 (EST) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +Received: from orion.SAPserv.Hamburg.dsh.de (Tpolaris2.sapham.debis.de [53.2.131.8]) + by hub.org (8.9.1/8.9.1) with SMTP id NAA02316 + for ; Fri, 13 Nov 1998 13:21:06 -0500 (EST) + (envelope-from wieck@sapserv.debis.de) +Received: by orion.SAPserv.Hamburg.dsh.de + for pgsql-hackers@postgreSQL.org + id m0zeOEf-000EBPC; Fri, 13 Nov 98 19:46 MET +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: [HACKERS] shmem limits and redolog +To: pgsql-hackers@postgreSQL.org (PostgreSQL HACKERS) +Date: Fri, 13 Nov 1998 19:46:20 +0100 (MET) +Reply-To: jwieck@debis.com (Jan Wieck) +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +Hi, + + I'm currently hacking around on a solution for logging all + database operations at query level that can recover a crashed + database from the last successful backup by redoing all the + commands. + + Well, I wanted it to be as flexible as can. So I decided to + make it per database configurable. One could say which + databases are logged and if a database is, if it is logged + sync or async (in sync mode, every COMMIT forces an fsync of + the actual logfile and controlfiles). + + To make async mode as fast as can, I'm using a shared memory + of 32K per database (not per backend) that is used as a wrap + around buffer from the backends to place their query + information. So the log writer can fall a little behind if + there are many backends doing different things that don't + lock each other. + + Now I'm a little in doubt about the shared memory limits + reported. Was it a good decision to use shared memory? Am I + better off using socket's? + + The bad thing in what I have up to now (it's far from + complete) is, that even if a database isn't currently logged, + a redolog writer is started and creates the 32K shmem segment + (plus a semaphore set with 5 semaphores). 
This is because I + plan to create commands like + + ALTER DATABASE LOG MODE=ASYNC LOGDIR='/somewhere/dbname'; + + and the like that can be used at runtime (while more than one + backend is connected to the database) to turn logging on/off, + switch to/from backup mode (all other activity is stopped) + etc. + + So every 32 databases will require another megabyte of shared + memory. The logging master controls which databases have + activity and kills redolog writers after some time of + inactivity, and the shmem is freed then. But it can hurt if + someone really has many many databases that are all used at + the same time. + + What do the others say? + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. # +#======================================== jwieck@debis.com (Jan Wieck) # + + + + +From owner-pgsql-hackers@hub.org Wed Dec 16 15:46:41 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id PAA00521 + for ; Wed, 16 Dec 1998 15:46:40 -0500 (EST) +Received: from hub.org (majordom@hub.org [209.47.145.100]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id PAA08772 for ; Wed, 16 Dec 1998 15:10:01 -0500 (EST) +Received: from localhost (majordom@localhost) + by hub.org (8.9.1/8.9.1) with SMTP id PAA01254; + Wed, 16 Dec 1998 15:06:56 -0500 (EST) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 16 Dec 1998 14:58:11 +0000 (EST) +Received: (from majordom@localhost) + by hub.org (8.9.1/8.9.1) id OAA00660 + for pgsql-hackers-outgoing; Wed, 16 Dec 1998 14:58:10 -0500 (EST) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +Received: from orion.SAPserv.Hamburg.dsh.de (Tpolaris2.sapham.debis.de [53.2.131.8]) + by hub.org (8.9.1/8.9.1) with SMTP id OAA00643 + for ; Wed, 16 Dec 1998 14:58:05 -0500 (EST) + (envelope-from wieck@sapserv.debis.de) +Received: by orion.SAPserv.Hamburg.dsh.de + for pgsql-hackers@postgreSQL.org + id m0zqNDo-000EBTC; Wed, 16 Dec 98 21:07 MET +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] redolog - for discussion +To: vadim@krs.ru (Vadim Mikheev) +Date: Wed, 16 Dec 1998 21:07:00 +0100 (MET) +Cc: jwieck@debis.com, pgsql-hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <3677B71D.C67462B3@krs.ru> from "Vadim Mikheev" at Dec 16, 98 08:35:25 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +Vadim wrote: + +> +> Jan Wieck wrote: +> > +> > RECOVER DATABASE {ALL | UNTIL 'datetime' | RESET}; +> > +> ... +> > +> > For the others, the backend starts the recovery program +> > which reads the redolog files, establishes database +> > connections as required and reruns all the commands in +> ^^^^^^^^^^^^^^^^^^^^^^^^^^ +> > them. If a required logfile isn't found, it tells the +> ^^^^^ +> +> I foresee problems with using _commands_ logging for +> recovery/replication -:(( +> +> Let's consider two concurrent updates in READ COMMITTED mode: +> +> update test set x = 2 where y = 1; +> +> and +> +> update test set x = 3 where y = 1; +> +> The result of both committed transaction will be x = 2 +> if the 1st transaction updated row _after_ 2nd transaction +> and x = 3 if the 2nd transaction gets row after 1st one. 
+> Order of updates is not defined by order in which commands +> begun and so order in which commands should be rerun +> will be unknown... + + Yepp, the order in which commands begun is absolutely not of + interest. Locking could already delay the execution of one + command until another one started later has finished and + released the lock. It's a classic race condition. + + Thus, my plan was to log the queries just before the call to + CommitTransactionCommand() in tcop. This has the advantage, + that queries which bail out with errors don't get into the + log at all and must not get rerun. And I can set a static + flag to false before starting the command, which is set to + true in the buffer manager when a buffer is written (marked + dirty), so filtering out queries that do no updates at all is + easy. + + Unfortunately query level logging get's hit by the current + implementation of sequence numbers. If a query that get's + aborted somewhere in the middle (maybe by a trigger) called + nextval() for rows processed earlier, the sequence number + isn't advanced at recovery time, because the query is + suppressed at all. And sequences aren't locked, so for + concurrently running queries getting numbers from the same + sequence, the results aren't reproduceable. If some + application selects a value resulting from a sequence and + uses that later in another query, how could the redolog know + that this has changed? It's a Const in the query logged, and + all that corrupts the whole thing. + + All that is painful and I don't see another solution yet than + to hook into nextval(), log out the numbers generated in + normal operation and getting back the same numbers in redo + mode. + + The whole thing gets more and more complicated :-( + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. 
# +#======================================== jwieck@debis.com (Jan Wieck) # + + + + diff --git a/doc/TODO.detail/memory b/doc/TODO.detail/memory new file mode 100644 index 0000000000..cda162aeb8 --- /dev/null +++ b/doc/TODO.detail/memory @@ -0,0 +1,1240 @@ +From owner-pgsql-hackers@hub.org Thu Nov 26 08:31:13 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id IAA24423 + for ; Thu, 26 Nov 1998 08:31:08 -0500 (EST) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id IAA04554 for ; Thu, 26 Nov 1998 08:04:30 -0500 (EST) +Received: from localhost (majordom@localhost) + by hub.org (8.9.1/8.9.1) with SMTP id HAA03761; + Thu, 26 Nov 1998 07:56:37 -0500 (EST) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Thu, 26 Nov 1998 07:55:28 +0000 (EST) +Received: (from majordom@localhost) + by hub.org (8.9.1/8.9.1) id HAA03689 + for pgsql-hackers-outgoing; Thu, 26 Nov 1998 07:55:26 -0500 (EST) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +Received: from orion.SAPserv.Hamburg.dsh.de (Tpolaris2.sapham.debis.de [53.2.131.8]) + by hub.org (8.9.1/8.9.1) with SMTP id HAA03674 + for ; Thu, 26 Nov 1998 07:55:19 -0500 (EST) + (envelope-from wieck@sapserv.debis.de) +Received: by orion.SAPserv.Hamburg.dsh.de + for pgsql-hackers@postgreSQL.org + id m0zj13G-000EBfC; Thu, 26 Nov 98 14:01 MET +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] Re: memory leak with Abort Transaction +To: takehi-s@ascii.co.jp (SHIOZAKI Takehiko) +Date: Thu, 26 Nov 1998 14:01:42 +0100 (MET) +Cc: pgsql-hackers@postgreSQL.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <199811261240.VAA27516@libpc01.pb.ascii.co.jp> from "SHIOZAKI Takehiko" at Nov 26, 98 09:40:19 pm +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +SHIOZAKI Takehiko wrote: + +> +> Hello! +> +> Releasing 6.4.1 is a good news. +> But would you confirm the following "memory leak" problem? +> It is reproducable on 6.4 (FreeBSD 2.2.7-RELEASE). + + It's an far too old problem. And as far as I remember, there + are different locations in the code causing it. + + One place I remember well. It's in the tcop mainloop in + PostgresMain(). The querytree list is malloc()'ed (there and + in the parser) and free()'d after the query is processed - + except the processing of the queries bails out with elog(). + In that case it never runs over the free() because the + longjmp() kick's it back to the beginning of the loop. + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. 
# +#======================================== jwieck@debis.com (Jan Wieck) # + + + + +From owner-pgsql-hackers@hub.org Fri Mar 19 16:01:29 1999 +Received: from hub.org (majordom@hub.org [209.47.145.100]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id QAA05828 + for ; Fri, 19 Mar 1999 16:01:22 -0500 (EST) +Received: from localhost (majordom@localhost) + by hub.org (8.9.2/8.9.1) with SMTP id PAA15701; + Fri, 19 Mar 1999 15:59:51 -0500 (EST) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Fri, 19 Mar 1999 15:59:08 +0000 (EST) +Received: (from majordom@localhost) + by hub.org (8.9.2/8.9.1) id PAA15551 + for pgsql-hackers-outgoing; Fri, 19 Mar 1999 15:59:05 -0500 (EST) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +Received: from andrew.cmu.edu (ANDREW.CMU.EDU [128.2.10.101]) + by hub.org (8.9.2/8.9.1) with ESMTP id PAA15524 + for ; Fri, 19 Mar 1999 15:58:53 -0500 (EST) + (envelope-from er1p+@andrew.cmu.edu) +Received: (from postman@localhost) by andrew.cmu.edu (8.8.5/8.8.2) id PAA29323 for pgsql-hackers@postgresql.org; Fri, 19 Mar 1999 15:58:50 -0500 (EST) +Received: via switchmail; Fri, 19 Mar 1999 15:58:50 -0500 (EST) +Received: from cloudy.me.cmu.edu via qmail + ID ; + Fri, 19 Mar 1999 15:58:37 -0500 (EST) +Received: from cloudy.me.cmu.edu via qmail + ID ; + Fri, 19 Mar 1999 15:58:31 -0500 (EST) +Received: from mms.4.60.Jun.27.1996.03.05.56.sun4.41.EzMail.2.0.CUILIB.3.45.SNAP.NOT.LINKED.cloudy.me.cmu.edu.sun4m.412 + via MS.5.6.cloudy.me.cmu.edu.sun4_41; + Fri, 19 Mar 1999 15:58:29 -0500 (EST) +Message-ID: +Date: Fri, 19 Mar 1999 15:58:29 -0500 (EST) +From: Erik Riedel +To: pgsql-hackers@postgreSQL.org +Subject: [HACKERS] aggregation memory leak and fix +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + + +Platform: Alpha, Digital UNIX 4.0D +Software: PostgreSQL 6.4.2 and 6.5 snaphot (11 March 1999) + +I have a table as follows: + +Table = lineitem ++------------------------+----------------------------------+-------+ +| Field | Type | Length| ++------------------------+----------------------------------+-------+ +| l_orderkey | int4 not null | 4 | +| l_partkey | int4 not null | 4 | +| l_suppkey | int4 not null | 4 | +| l_linenumber | int4 not null | 4 | +| l_quantity | float4 not null | 4 | +| l_extendedprice | float4 not null | 4 | +| l_discount | float4 not null | 4 | +| l_tax | float4 not null | 4 | +| l_returnflag | char() not null | 1 | +| l_linestatus | char() not null | 1 | +| l_shipdate | date | 4 | +| l_commitdate | date | 4 | +| l_receiptdate | date | 4 | +| l_shipinstruct | char() not null | 25 | +| l_shipmode | char() not null | 10 | +| l_comment | char() not null | 44 | ++------------------------+----------------------------------+-------+ +Index: lineitem_index_ + +that ends up having on the order of 500,000 rows (about 100 MB on disk). 
+ +I then run an aggregation query as: + +-- +-- Query 1 +-- +select l_returnflag, l_linestatus, sum(l_quantity) as sum_qty, +sum(l_extendedprice) as sum_base_price, +sum(l_extendedprice*(1-l_discount)) as sum_disc_price, +sum(l_extendedprice*(1-l_discount)*(1+l_tax)) as sum_charge, +avg(l_quantity) as avg_qty, avg(l_extendedprice) as avg_price, +avg(l_discount) as avg_disc, count(*) as count_order +from lineitem +where l_shipdate <= ('1998-12-01'::datetime - interval '90 day')::date +group by l_returnflag, l_linestatus +order by l_returnflag, l_linestatus; + + +when I run this against 6.4.2, the postgres process grows to upwards of +1 GB of memory (at which point something overflows and it dumps core) - +I watch it grow through 200 MB, 400 MB, 800 MB, dies somewhere near 1 GB +of allocated memory). + +If I take out a few of the "sum" expressions it gets better, removing +sum_disk_price and sum_charge causes it to be only 600 MB and the query +actually (eventually) completes. Takes about 10 minutes on my 500 MHz +machine with 256 MB core and 4 GB of swap. + +The problem seems to be the memory allocation mechanism. Looking at a +call trace, it is doing some kind of "sub query" plan for each row in +the database. That means it does ExecEval and postquel_function and +postquel_execute and all their friends for each row in the database. +Allocating a couple hundred bytes for each one. + +The problem is that none of these allocations are freed - they seem to +depend on the AllocSet to free them at the end of the transaction. This +means it isn't a "true" leak, because the bytes are all freed at the +(very) end of the transaction, but it does mean that the process grows +to unreasonable size in the meantime. There is no need for this, +because the individual expression results are aggregated as it goes +along, so the intermediate nodes can be freed. + +I spent half a day last week chasing down the offending palloc() calls +and execution stacks sufficiently that I think I found the right places +to put pfree() calls. + +As a result, I have changes in the files: + +src/backend/executor/execUtils.c +src/backend/executor/nodeResult.c +src/backend/executor/nodeAgg.c +src/backend/executor/execMain.c + +patches to these files are attached at the end of this message. These +files are based on the 6.5.0 snapshot downloaded from ftp.postgreql.org +on 11 March 1999. + +Apologies for sending patches to a non-released version. If anyone has +problems applying the patches, I can send the full files (I wanted to +avoid sending a 100K shell archive to the list). If anyone cares about +reproducing my exact problem with the above table, I can provide the 100 +MB pg_dump file for download as well. + +Secondary Issue: the reason I did not use the 6.4.2 code to make my +changes is because the AllocSet calls in that one were particularly +egregious - they only had the skeleton of the allocsets code that exists +in the 6.5 snapshots, so they were calling malloc() for all of the 8 and +16 byte allocations that the above query causes. + +Using the fixed code reduces the maximum memory requirement on the above +query to about 210 MB, and reduces the runtime to (an acceptable) 1.5 +minutes - a factor of more than 6x improvement on my 256 MB machine. + +Now the biggest part of the execution time is in the sort before the +aggregation (which isn't strictly needed, but that is an optimization +for another day). 
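+The nodeAgg.c change in the patches attached below boils down to the
+loop shape sketched here. This is a simplified, standalone model of the
+idea (not backend code): malloc/free stand in for palloc/pfree, and
+transition_add stands in for the aggregate transition function called
+through fmgr. The point is that the previous by-reference transition
+value is released on every row, so per-row results no longer pile up
+until the end of the transaction.
+
+    #include <stdio.h>
+    #include <stdlib.h>
+
+    /* stand-in for a pass-by-reference Datum: each call returns a
+     * freshly allocated result, the way the pass-by-reference float
+     * functions do */
+    static double *
+    transition_add(const double *state, double newval)
+    {
+        double *result = malloc(sizeof(double));   /* palloc() analogue */
+
+        *result = (state ? *state : 0.0) + newval;
+        return result;
+    }
+
+    int
+    main(void)
+    {
+        double *state = NULL;
+        long    row;
+
+        for (row = 0; row < 500000; row++)
+        {
+            double *oldVal = state;                 /* save the old datum */
+
+            state = transition_add(state, 1.0);     /* xfn1 via fmgr_c()  */
+            free(oldVal);                           /* pfree(oldVal): omit
+                                                     * this and every row's
+                                                     * result lives to the
+                                                     * end of the query   */
+        }
+        printf("sum = %.0f\n", *state);
+        free(state);
+        return 0;
+    }
+
+Note that in the backend this free is only safe for pass-by-reference
+types; a pass-by-value Datum is not a pointer, so the real code has to
+check pg_type.typbyval before releasing anything.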
+ +Open Issue: there is still a small "leak" that I couldn't eliminate, I +think I chased it down to the constvalue allocated in +execQual::ExecTargetList(), but I couldn't figure out where to properly +free it. 8 bytes leaked was much better than 750 bytes, so I stopped +banging my head on that particular item. + +Secondary Open Issue: what I did have to do to get down to 210 MB of +core was reduce the minimum allocation size in AllocSet to 8 bytes from +16 bytes. That reduces the 8 byte leak above to a true 8 byte, rather +than a 16 byte leak. Otherwise, I think the size was 280 MB (still a +big improvement on 1000+ MB). I only changed this in my code and I am +not including a changed mcxt.c for that. + +I hope my changes are understandable/reasonable. Enjoy. + +Erik Riedel +Carnegie Mellon University +www.cs.cmu.edu/~riedel + +--------------[aggregation_memory_patch.sh]----------------------- + +#! /bin/sh +# This is a shell archive, meaning: +# 1. Remove everything above the #! /bin/sh line. +# 2. Save the resulting text in a file. +# 3. Execute the file with /bin/sh (not csh) to create: +# execMain.c.diff +# execUtils.c.diff +# nodeAgg.c.diff +# nodeResult.c.diff +# This archive created: Fri Mar 19 15:47:17 1999 +export PATH; PATH=/bin:/usr/bin:$PATH +if test -f 'execMain.c.diff' +then + echo shar: "will not over-write existing file 'execMain.c.diff'" +else +cat << \SHAR_EOF > 'execMain.c.diff' +583c + +. +398a + +. +396a + /* XXX - clean up some more from ExecutorStart() - er1p */ + if (NULL == estate->es_snapshot) { + /* nothing to free */ + } else { + if (estate->es_snapshot->xcnt > 0) { + pfree(estate->es_snapshot->xip); + } + pfree(estate->es_snapshot); + } + + if (NULL == estate->es_param_exec_vals) { + /* nothing to free */ + } else { + pfree(estate->es_param_exec_vals); + estate->es_param_exec_vals = NULL; + } + +. +SHAR_EOF +fi +if test -f 'execUtils.c.diff' +then + echo shar: "will not over-write existing file 'execUtils.c.diff'" +else +cat << \SHAR_EOF > 'execUtils.c.diff' +368a +} + +/* ---------------- + * ExecFreeExprContext + * ---------------- + */ +void +ExecFreeExprContext(CommonState *commonstate) +{ + ExprContext *econtext; + + /* ---------------- + * get expression context. if NULL then this node has + * none so we just return. + * ---------------- + */ + econtext = commonstate->cs_ExprContext; + if (econtext == NULL) + return; + + /* ---------------- + * clean up memory used. + * ---------------- + */ + pfree(econtext); + commonstate->cs_ExprContext = NULL; +} + +/* ---------------- + * ExecFreeTypeInfo + * ---------------- + */ +void +ExecFreeTypeInfo(CommonState *commonstate) +{ + TupleDesc tupDesc; + + tupDesc = commonstate->cs_ResultTupleSlot->ttc_tupleDescriptor; + if (tupDesc == NULL) + return; + + /* ---------------- + * clean up memory used. + * ---------------- + */ + FreeTupleDesc(tupDesc); + commonstate->cs_ResultTupleSlot->ttc_tupleDescriptor = NULL; +. +274a + +. +SHAR_EOF +fi +if test -f 'nodeAgg.c.diff' +then + echo shar: "will not over-write existing file 'nodeAgg.c.diff'" +else +cat << \SHAR_EOF > 'nodeAgg.c.diff' +376a + pfree(oldVal); /* XXX - new, let's free the old datum - er1p */ +. +374a + oldVal = value1[aggno]; /* XXX - save so we can free later - er1p */ +. +112a + Datum oldVal = (Datum) NULL; /* XXX - so that we can save and free on +each iteration - er1p */ +. 
+SHAR_EOF +fi +if test -f 'nodeResult.c.diff' +then + echo shar: "will not over-write existing file 'nodeResult.c.diff'" +else +cat << \SHAR_EOF > 'nodeResult.c.diff' +278a + pfree(resstate); node->resstate = NULL; /* XXX - new for us - er1p */ +. +265a + ExecFreeExprContext(&resstate->cstate); /* XXX - new for us - er1p */ + ExecFreeTypeInfo(&resstate->cstate); /* XXX - new for us - er1p */ +. +SHAR_EOF +fi +exit 0 +# End of shell archive + + + +From er1p+@andrew.cmu.edu Fri Mar 19 19:43:27 1999 +Received: from po8.andrew.cmu.edu (PO8.ANDREW.CMU.EDU [128.2.10.108]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id TAA09183 + for ; Fri, 19 Mar 1999 19:43:26 -0500 (EST) +Received: (from postman@localhost) by po8.andrew.cmu.edu (8.8.5/8.8.2) id TAA11773; Fri, 19 Mar 1999 19:43:18 -0500 (EST) +Received: via switchmail; Fri, 19 Mar 1999 19:43:18 -0500 (EST) +Received: from cloudy.me.cmu.edu via qmail + ID ; + Fri, 19 Mar 1999 19:43:05 -0500 (EST) +Received: from mms.4.60.Jun.27.1996.03.05.56.sun4.41.EzMail.2.0.CUILIB.3.45.SNAP.NOT.LINKED.cloudy.me.cmu.edu.sun4m.412 + via MS.5.6.cloudy.me.cmu.edu.sun4_41; + Fri, 19 Mar 1999 19:43:02 -0500 (EST) +Message-ID: +Date: Fri, 19 Mar 1999 19:43:02 -0500 (EST) +From: Erik Riedel +To: Bruce Momjian +Subject: Re: [HACKERS] aggregation memory leak and fix +Cc: pgsql-hackers@postgreSQL.org +In-Reply-To: <199903192223.RAA06691@candle.pha.pa.us> +References: <199903192223.RAA06691@candle.pha.pa.us> +Status: ROr + + +> No apologies necessary. Glad to have someone digging into that area of +> the code. We will gladly apply your patches to 6.5. However, I request +> that you send context diffs(diff -c). Normal diffs are just too +> error-prone in application. Send them, and I will apply them right +> away. +> +Context diffs attached. This was due to my ignorance of diff. When I +made the other files, I though "hmm, these could be difficult to apply +if the code has changed a bit, wouldn't it be good if they included a +few lines before and after the fix". Now I know "-c". + +> Not sure why that is there? Perhaps for GROUP BY processing? +> +Right, it is a result of the Group processing requiring sorted input. +Just that it doesn't "require" sorted input, it "could" be a little more +flexible and the sort wouldn't be necessary. Essentially this would be +a single "AggSort" node that did the aggregation while sorting (probably +with replacement selection rather than quicksort). This definitely +would require some code/smarts that isn't there today. + +> > think I chased it down to the constvalue allocated in +> > execQual::ExecTargetList(), but I couldn't figure out where to properly +> > free it. 8 bytes leaked was much better than 750 bytes, so I stopped +> > banging my head on that particular item. +> +> Can you give me the exact line? Is it the palloc(1)? +> +No, the 8 bytes seem to come from the ExecEvalExpr() call near line +1530. Problem was when I tried to free these, I got "not in AllocSet" +errors, so something more complicated was going on. + +Thanks. + +Erik + +-----------[aggregation_memory_patch.sh]---------------------- + +#! /bin/sh +# This is a shell archive, meaning: +# 1. Remove everything above the #! /bin/sh line. +# 2. Save the resulting text in a file. +# 3. 
Execute the file with /bin/sh (not csh) to create: +# execMain.c.diff +# execUtils.c.diff +# nodeAgg.c.diff +# nodeResult.c.diff +# This archive created: Fri Mar 19 19:35:42 1999 +export PATH; PATH=/bin:/usr/bin:$PATH +if test -f 'execMain.c.diff' +then + echo shar: "will not over-write existing file 'execMain.c.diff'" +else +cat << \SHAR_EOF > 'execMain.c.diff' +*** +/afs/ece.cmu.edu/project/lcs/lcs-004/er1p/postgres/611/src/backend/executor/ +execMain.c Thu Mar 11 23:59:11 1999 +--- +/afs/ece.cmu.edu/project/lcs/lcs-004/er1p/postgres/612/src/backend/executor/ +execMain.c Fri Mar 19 15:03:28 1999 +*************** +*** 394,401 **** +--- 394,419 ---- + + EndPlan(queryDesc->plantree, estate); + ++ /* XXX - clean up some more from ExecutorStart() - er1p */ ++ if (NULL == estate->es_snapshot) { ++ /* nothing to free */ ++ } else { ++ if (estate->es_snapshot->xcnt > 0) { ++ pfree(estate->es_snapshot->xip); ++ } ++ pfree(estate->es_snapshot); ++ } ++ ++ if (NULL == estate->es_param_exec_vals) { ++ /* nothing to free */ ++ } else { ++ pfree(estate->es_param_exec_vals); ++ estate->es_param_exec_vals = NULL; ++ } ++ + /* restore saved refcounts. */ + BufferRefCountRestore(estate->es_refcount); ++ + } + + void +*************** +*** 580,586 **** + /* + * initialize result relation stuff + */ +! + if (resultRelation != 0 && operation != CMD_SELECT) + { + /* +--- 598,604 ---- + /* + * initialize result relation stuff + */ +! + if (resultRelation != 0 && operation != CMD_SELECT) + { + /* +SHAR_EOF +fi +if test -f 'execUtils.c.diff' +then + echo shar: "will not over-write existing file 'execUtils.c.diff'" +else +cat << \SHAR_EOF > 'execUtils.c.diff' +*** +/afs/ece.cmu.edu/project/lcs/lcs-004/er1p/postgres/611/src/backend/executor/ +execUtils.c Thu Mar 11 23:59:11 1999 +--- +/afs/ece.cmu.edu/project/lcs/lcs-004/er1p/postgres/612/src/backend/executor/ +execUtils.c Fri Mar 19 14:55:59 1999 +*************** +*** 272,277 **** +--- 272,278 ---- + #endif + i++; + } ++ + if (len > 0) + { + ExecAssignResultType(commonstate, +*************** +*** 366,371 **** +--- 367,419 ---- + + pfree(projInfo); + commonstate->cs_ProjInfo = NULL; ++ } ++ ++ /* ---------------- ++ * ExecFreeExprContext ++ * ---------------- ++ */ ++ void ++ ExecFreeExprContext(CommonState *commonstate) ++ { ++ ExprContext *econtext; ++ ++ /* ---------------- ++ * get expression context. if NULL then this node has ++ * none so we just return. ++ * ---------------- ++ */ ++ econtext = commonstate->cs_ExprContext; ++ if (econtext == NULL) ++ return; ++ ++ /* ---------------- ++ * clean up memory used. ++ * ---------------- ++ */ ++ pfree(econtext); ++ commonstate->cs_ExprContext = NULL; ++ } ++ ++ /* ---------------- ++ * ExecFreeTypeInfo ++ * ---------------- ++ */ ++ void ++ ExecFreeTypeInfo(CommonState *commonstate) ++ { ++ TupleDesc tupDesc; ++ ++ tupDesc = commonstate->cs_ResultTupleSlot->ttc_tupleDescriptor; ++ if (tupDesc == NULL) ++ return; ++ ++ /* ---------------- ++ * clean up memory used. 
++ * ---------------- ++ */ ++ FreeTupleDesc(tupDesc); ++ commonstate->cs_ResultTupleSlot->ttc_tupleDescriptor = NULL; + } + + /* ---------------------------------------------------------------- +SHAR_EOF +fi +if test -f 'nodeAgg.c.diff' +then + echo shar: "will not over-write existing file 'nodeAgg.c.diff'" +else +cat << \SHAR_EOF > 'nodeAgg.c.diff' +*** +/afs/ece.cmu.edu/project/lcs/lcs-004/er1p/postgres/611/src/backend/executor/ +nodeAgg.c Thu Mar 11 23:59:11 1999 +--- +/afs/ece.cmu.edu/project/lcs/lcs-004/er1p/postgres/612/src/backend/executor/ +nodeAgg.c Fri Mar 19 15:01:21 1999 +*************** +*** 110,115 **** +--- 110,116 ---- + isNull2 = FALSE; + bool qual_result; + ++ Datum oldVal = (Datum) NULL; /* XXX - so that we can save and free +on each iteration - er1p */ + + /* --------------------- + * get state info from node +*************** +*** 372,379 **** +--- 373,382 ---- + */ + args[0] = value1[aggno]; + args[1] = newVal; ++ oldVal = value1[aggno]; /* XXX - save so we can free later - er1p */ + value1[aggno] = (Datum) fmgr_c(&aggfns->xfn1, + (FmgrValues *) args, &isNull1); ++ pfree(oldVal); /* XXX - new, let's free the old datum - er1p */ + Assert(!isNull1); + } + } +SHAR_EOF +fi +if test -f 'nodeResult.c.diff' +then + echo shar: "will not over-write existing file 'nodeResult.c.diff'" +else +cat << \SHAR_EOF > 'nodeResult.c.diff' +*** +/afs/ece.cmu.edu/project/lcs/lcs-004/er1p/postgres/611/src/backend/executor/ +nodeResult.c Thu Mar 11 23:59:12 1999 +--- +/afs/ece.cmu.edu/project/lcs/lcs-004/er1p/postgres/612/src/backend/executor/ +nodeResult.c Fri Mar 19 14:57:26 1999 +*************** +*** 263,268 **** +--- 263,270 ---- + * is freed at end-transaction time. -cim 6/2/91 + * ---------------- + */ ++ ExecFreeExprContext(&resstate->cstate); /* XXX - new for us - er1p */ ++ ExecFreeTypeInfo(&resstate->cstate); /* XXX - new for us - er1p */ + ExecFreeProjectionInfo(&resstate->cstate); + + /* ---------------- +*************** +*** 276,281 **** +--- 278,284 ---- + * ---------------- + */ + ExecClearTuple(resstate->cstate.cs_ResultTupleSlot); ++ pfree(resstate); node->resstate = NULL; /* XXX - new for us - er1p */ + } + + void +SHAR_EOF +fi +exit 0 +# End of shell archive + + +From owner-pgsql-hackers@hub.org Fri Mar 19 21:01:15 1999 +Received: from hub.org (majordom@hub.org [209.47.145.100]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id VAA11368 + for ; Fri, 19 Mar 1999 21:01:13 -0500 (EST) +Received: from localhost (majordom@localhost) + by hub.org (8.9.2/8.9.1) with SMTP id UAA40887; + Fri, 19 Mar 1999 20:59:47 -0500 (EST) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Fri, 19 Mar 1999 20:58:14 +0000 (EST) +Received: (from majordom@localhost) + by hub.org (8.9.2/8.9.1) id UAA40637 + for pgsql-hackers-outgoing; Fri, 19 Mar 1999 20:58:12 -0500 (EST) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +Received: from candle.pha.pa.us (maillist@s5-03.ppp.op.net [209.152.195.67]) + by hub.org (8.9.2/8.9.1) with ESMTP id UAA40620 + for ; Fri, 19 Mar 1999 20:58:02 -0500 (EST) + (envelope-from maillist@candle.pha.pa.us) +Received: (from maillist@localhost) + by candle.pha.pa.us (8.9.0/8.9.0) id UAA11263; + Fri, 19 Mar 1999 20:58:00 -0500 (EST) +From: Bruce Momjian +Message-Id: <199903200158.UAA11263@candle.pha.pa.us> +Subject: Re: [HACKERS] aggregation memory leak and fix +In-Reply-To: from Erik Riedel at "Mar 19, 1999 7:43: 2 pm" +To: riedel+@CMU.EDU (Erik Riedel) +Date: Fri, 19 Mar 1999 20:58:00 -0500 (EST) +Cc: 
pgsql-hackers@postgreSQL.org +X-Mailer: ELM [version 2.4ME+ PL47 (25)] +MIME-Version: 1.0 +Content-Type: text/plain; charset=US-ASCII +Content-Transfer-Encoding: 7bit +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +> +> > No apologies necessary. Glad to have someone digging into that area of +> > the code. We will gladly apply your patches to 6.5. However, I request +> > that you send context diffs(diff -c). Normal diffs are just too +> > error-prone in application. Send them, and I will apply them right +> > away. +> > +> Context diffs attached. This was due to my ignorance of diff. When I +> made the other files, I though "hmm, these could be difficult to apply +> if the code has changed a bit, wouldn't it be good if they included a +> few lines before and after the fix". Now I know "-c". + +Applied. + +> > Not sure why that is there? Perhaps for GROUP BY processing? +> > +> Right, it is a result of the Group processing requiring sorted input. +> Just that it doesn't "require" sorted input, it "could" be a little more +> flexible and the sort wouldn't be necessary. Essentially this would be +> a single "AggSort" node that did the aggregation while sorting (probably +> with replacement selection rather than quicksort). This definitely +> would require some code/smarts that isn't there today. + +I think you will find make_groupPlan adds the sort as needed by the +GROUP BY. I assume you are suggesting to do the aggregate/GROUP on unsorted +data, which is hard to do in a flexible way. + +> > > think I chased it down to the constvalue allocated in +> > > execQual::ExecTargetList(), but I couldn't figure out where to properly +> > > free it. 8 bytes leaked was much better than 750 bytes, so I stopped +> > > banging my head on that particular item. +> > +> > Can you give me the exact line? Is it the palloc(1)? +> > +> No, the 8 bytes seem to come from the ExecEvalExpr() call near line +> 1530. Problem was when I tried to free these, I got "not in AllocSet" +> errors, so something more complicated was going on. + +Yes, if you look inside ExecEvalExpr(), you will see it tries to get a +value for the expression(Datum). It may return an int, float4, or a +string. In the last case, that is actually a pointer and not a specific +value. + +So, in some cases, the value can just be thrown away, or it may be a +pointer to memory that can be freed after the call to heap_formtuple() +later in the function. The trick is to find the function call in +ExecEvalExpr() that is allocating something, and conditionally free +values[] after the call to heap_formtuple(). If you don't want find it, +perhaps you can send me enough info so I can see it here. + +I wonder whether it is the call to CreateTupleDescCopy() inside +ExecEvalVar()? + +Another problem I just fixed is that fjIsNull was not being pfree'ed if +it was used with >64 targets, but I don't think that affects you. + +I also assume you have run your recent patch through the the +test/regression tests, so see it does not cause some other area to fail, +right? + +-- + Bruce Momjian | http://www.op.net/~candle + maillist@candle.pha.pa.us | (610) 853-3000 + + If your life is a hard drive, | 830 Blythe Avenue + + Christ can be your backup. 
| Drexel Hill, Pennsylvania 19026 + + +From owner-pgsql-hackers@hub.org Sat Mar 20 12:01:44 1999 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id MAA24855 + for ; Sat, 20 Mar 1999 12:01:43 -0500 (EST) +Received: from hub.org (majordom@hub.org [209.47.145.100]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id LAA11985 for ; Sat, 20 Mar 1999 11:58:48 -0500 (EST) +Received: from localhost (majordom@localhost) + by hub.org (8.9.2/8.9.1) with SMTP id LAA12367; + Sat, 20 Mar 1999 11:57:17 -0500 (EST) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Sat, 20 Mar 1999 11:55:22 +0000 (EST) +Received: (from majordom@localhost) + by hub.org (8.9.2/8.9.1) id LAA12026 + for pgsql-hackers-outgoing; Sat, 20 Mar 1999 11:55:17 -0500 (EST) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +Received: from sss.sss.pgh.pa.us (sss.pgh.pa.us [206.210.65.6]) + by hub.org (8.9.2/8.9.1) with ESMTP id LAA11871 + for ; Sat, 20 Mar 1999 11:54:57 -0500 (EST) + (envelope-from tgl@sss.pgh.pa.us) +Received: from sss.sss.pgh.pa.us (localhost [127.0.0.1]) + by sss.sss.pgh.pa.us (8.9.1/8.9.1) with ESMTP id LAA28068; + Sat, 20 Mar 1999 11:48:58 -0500 (EST) +To: Bruce Momjian +cc: riedel+@CMU.EDU (Erik Riedel), pgsql-hackers@postgreSQL.org +Subject: Re: [HACKERS] aggregation memory leak and fix +In-reply-to: Your message of Fri, 19 Mar 1999 21:33:33 -0500 (EST) + <199903200233.VAA11816@candle.pha.pa.us> +Date: Sat, 20 Mar 1999 11:48:58 -0500 +Message-ID: <28066.921948538@sss.pgh.pa.us> +From: Tom Lane +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +Bruce Momjian writes: +> My only quick solution would seem to be to add a new "expression" memory +> context, that can be cleared after every tuple is processed, clearing +> out temporary values allocated inside an expression. + +Right, this whole problem of growing backend memory use during a large +SELECT (or COPY, or probably a few other things) is one of the things +that we were talking about addressing by revising the memory management +structure. + +I think what we want inside the executor is a distinction between +storage that must live to the end of the statement and storage that is +only needed while processing the current tuple. The second kind of +storage would go into a separate context that gets flushed every so +often. (It could be every tuple, or every dozen or hundred tuples +depending on what seems the best tradeoff of cycles against memory +usage.) + +I'm not sure that just two contexts is enough, either. For example in + SELECT field1, SUM(field2) GROUP BY field1; +the working memory for the SUM aggregate could not be released after +each tuple, but perhaps we don't want it to live for the whole statement +either --- in that case we'd need a per-group context. (This particular +example isn't very convincing, because the same storage for the SUM +*could* be recycled from group to group. But I don't know whether it +actually *is* reused or not. If fresh storage is palloc'd for each +instantiation of SUM then we have a per-group leak in this scenario. +In any case, I'm not sure all aggregate functions have constant memory +requirements that would let them recycle storage across groups.) + +What we need to do is work out what the best set of memory context +definitions is, and then decide on a strategy for making sure that +lower-level routines allocate their return values in the right context. 
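+A toy model of that division may make the strategy concrete. This is
+purely illustrative code, not the backend allocator: a per-tuple pool
+that callers allocate from with no per-object bookkeeping, and that the
+executor flushes wholesale between tuples.
+
+    #include <stdio.h>
+    #include <stdlib.h>
+
+    /* a toy per-tuple "memory context": one block, bump-allocated
+     * (alignment and growth ignored for brevity) */
+    typedef struct
+    {
+        char   *base;
+        size_t  used;
+        size_t  size;
+    } ToyContext;
+
+    static ToyContext *
+    ctx_create(size_t size)
+    {
+        ToyContext *cxt = malloc(sizeof(ToyContext));
+
+        cxt->base = malloc(size);
+        cxt->used = 0;
+        cxt->size = size;
+        return cxt;
+    }
+
+    /* palloc() analogue: nothing is tracked per object, so there is
+     * no per-object free at all */
+    static void *
+    ctx_alloc(ToyContext *cxt, size_t n)
+    {
+        void   *p;
+
+        if (cxt->used + n > cxt->size)
+        {
+            fputs("toy context exhausted\n", stderr);
+            exit(1);
+        }
+        p = cxt->base + cxt->used;
+        cxt->used += n;
+        return p;
+    }
+
+    /* "context clear": everything allocated since the last clear is
+     * gone, in constant time */
+    static void
+    ctx_reset(ToyContext *cxt)
+    {
+        cxt->used = 0;
+    }
+
+    int
+    main(void)
+    {
+        ToyContext *per_tuple = ctx_create(64 * 1024);
+        int         tuple;
+
+        for (tuple = 0; tuple < 1000000; tuple++)
+        {
+            /* expression evaluation scribbles freely here */
+            double *tmp = ctx_alloc(per_tuple, sizeof(double));
+
+            *tmp = tuple * 1.0;
+
+            /* ... the interesting result would be copied out or fed
+             * to an aggregate living in a longer-lived context ... */
+
+            ctx_reset(per_tuple);       /* between tuples: clear the
+                                         * pool, don't pfree one by one */
+        }
+        puts("done");
+        free(per_tuple->base);
+        free(per_tuple);
+        return 0;
+    }
+
+In this toy the flush is a single pointer reset no matter how much was
+allocated for the tuple; per-statement and per-group storage would
+simply live in other pools that are cleared on their own schedule.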
+It'd be nice if the lower-level routines could still call palloc() and +not have to worry about this explicitly --- otherwise we'll break not +only a lot of our own code but perhaps a lot of user code. (User- +specific data types and SPI code all use palloc, no?) + +I think it is too late to try to fix this for 6.5, but it ought to be a +top priority for 6.6. + + regards, tom lane + + +From tgl@sss.pgh.pa.us Sun Mar 21 16:01:46 1999 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id QAA00139 + for ; Sun, 21 Mar 1999 16:01:45 -0500 (EST) +Received: from sss.sss.pgh.pa.us (sss.pgh.pa.us [206.210.65.6]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id PAA27737 for ; Sun, 21 Mar 1999 15:52:38 -0500 (EST) +Received: from sss.sss.pgh.pa.us (localhost [127.0.0.1]) + by sss.sss.pgh.pa.us (8.9.1/8.9.1) with ESMTP id PAA14946; + Sun, 21 Mar 1999 15:50:20 -0500 (EST) +To: Bruce Momjian +cc: pgsql-hackers@postgreSQL.org +Subject: Re: [HACKERS] aggregation memory leak and fix +In-reply-to: Your message of Sun, 21 Mar 1999 14:20:39 -0500 (EST) + <199903211920.OAA28744@candle.pha.pa.us> +Date: Sun, 21 Mar 1999 15:50:20 -0500 +Message-ID: <14944.922049420@sss.pgh.pa.us> +From: Tom Lane +Status: ROr + +Bruce Momjian writes: +>> What we need to do is work out what the best set of memory context +>> definitions is, and then decide on a strategy for making sure that +>> lower-level routines allocate their return values in the right context. + +> Let's suppose that we want to free all the memory used as expression +> intermediate values after each row is processed. +> It is my understanding that all these are created in utils/adt/*.c +> files, and that the entry point to all those functions via +> fmgr()/fmgr_c(). + +That's probably the bulk of the specific calls of palloc(). Someone +(Jan?) did a scan of the code a while ago looking for palloc() calls, +and there aren't that many outside of the data-type-specific functions. +But we'd have to look individually at all the ones that are elsewhere. + +> So, if we go into an expression memory context before calling +> fmgr/fmgr_c in the executor, and return to the normal context after the +> function call, all our intermediates are trapped in the expression +> memory context. + +OK, so you're saying we leave the data-type-specific functions as is +(calling palloc() to allocate their result areas), and make each call +site specifically responsible for setting the context that palloc() will +allocate from? That could work, I think. We'd need to see what side +effects it'd have on other uses of palloc(). + +What we'd probably want is to use a stack discipline for the current +palloc-target memory context: when you set the context, you get back the +ID of the old context, and you are supposed to restore that old context +before returning. + +> At the end of each row, we just free the expression memory context. In +> almost all cases, the data is stored in tuples, and we can free it. In +> a few cases like aggregates, we have to save off the value we need to +> keep before freeing the expression context. + +Actually, nodeAgg would just have to set an appropriate context before +calling fmgr to execute the aggregate's transition functions, and then +it wouldn't need an extra copy step. The results would come back in the +right context already. + +> In fact, you could even optimize the cleanup to only do free'ing if +> some expression memory was allocated. In most cases, it is not. 
+ +Jan's stuff should already fall through pretty quickly if there's +nothing in the context, I think. Note that what we want to do between +tuples is a "context clear" of the expression context, not a "context +delete" and then "context create" a new expression context. Context +clear should be a pretty quick no-op if nothing's been allocated in that +context... + +> In fact the nodeAgg.c patch that I backed out attempted to do that, +> though because there wasn't code that checked if the Datum was +> pg_type.typbyval, it didn't work 100%. + +Right. But if we approach it this way (clear the context at appropriate +times) rather than thinking in terms of explicitly pfree'ing individual +objects, life gets much simpler. Also, if we insist on being able to +pfree individual objects inside a context, we can't use Jan's faster +allocator! Remember, the reason it is faster and lower overhead is that +it doesn't keep track of individual objects, only pools. + +I'd like to see us head in the direction of removing most of the +explicit pfree calls that exist now, and instead rely on clearing +memory contexts at appropriate times in order to manage memory. +The fewer places where we need pfree, the more contexts can be run +with the low-overhead space allocator. Also, the fewer explicit +pfrees we need, the simpler and more reliable the code gets. + + regards, tom lane + +From owner-pgsql-hackers@hub.org Sun Mar 21 16:01:49 1999 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id QAA00149 + for ; Sun, 21 Mar 1999 16:01:48 -0500 (EST) +Received: from hub.org (majordom@hub.org [209.47.145.100]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id PAA27950 for ; Sun, 21 Mar 1999 15:56:07 -0500 (EST) +Received: from localhost (majordom@localhost) + by hub.org (8.9.2/8.9.1) with SMTP id PAA39413; + Sun, 21 Mar 1999 15:54:51 -0500 (EST) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Sun, 21 Mar 1999 15:54:31 +0000 (EST) +Received: (from majordom@localhost) + by hub.org (8.9.2/8.9.1) id PAA39249 + for pgsql-hackers-outgoing; Sun, 21 Mar 1999 15:54:27 -0500 (EST) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +Received: from sss.sss.pgh.pa.us (sss.pgh.pa.us [206.210.65.6]) + by hub.org (8.9.2/8.9.1) with ESMTP id PAA39235 + for ; Sun, 21 Mar 1999 15:54:21 -0500 (EST) + (envelope-from tgl@sss.pgh.pa.us) +Received: from sss.sss.pgh.pa.us (localhost [127.0.0.1]) + by sss.sss.pgh.pa.us (8.9.1/8.9.1) with ESMTP id PAA14946; + Sun, 21 Mar 1999 15:50:20 -0500 (EST) +To: Bruce Momjian +cc: pgsql-hackers@postgreSQL.org +Subject: Re: [HACKERS] aggregation memory leak and fix +In-reply-to: Your message of Sun, 21 Mar 1999 14:20:39 -0500 (EST) + <199903211920.OAA28744@candle.pha.pa.us> +Date: Sun, 21 Mar 1999 15:50:20 -0500 +Message-ID: <14944.922049420@sss.pgh.pa.us> +From: Tom Lane +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +Bruce Momjian writes: +>> What we need to do is work out what the best set of memory context +>> definitions is, and then decide on a strategy for making sure that +>> lower-level routines allocate their return values in the right context. + +> Let's suppose that we want to free all the memory used as expression +> intermediate values after each row is processed. +> It is my understanding that all these are created in utils/adt/*.c +> files, and that the entry point to all those functions via +> fmgr()/fmgr_c(). 
+ +That's probably the bulk of the specific calls of palloc(). Someone +(Jan?) did a scan of the code a while ago looking for palloc() calls, +and there aren't that many outside of the data-type-specific functions. +But we'd have to look individually at all the ones that are elsewhere. + +> So, if we go into an expression memory context before calling +> fmgr/fmgr_c in the executor, and return to the normal context after the +> function call, all our intermediates are trapped in the expression +> memory context. + +OK, so you're saying we leave the data-type-specific functions as is +(calling palloc() to allocate their result areas), and make each call +site specifically responsible for setting the context that palloc() will +allocate from? That could work, I think. We'd need to see what side +effects it'd have on other uses of palloc(). + +What we'd probably want is to use a stack discipline for the current +palloc-target memory context: when you set the context, you get back the +ID of the old context, and you are supposed to restore that old context +before returning. + +> At the end of each row, we just free the expression memory context. In +> almost all cases, the data is stored in tuples, and we can free it. In +> a few cases like aggregates, we have to save off the value we need to +> keep before freeing the expression context. + +Actually, nodeAgg would just have to set an appropriate context before +calling fmgr to execute the aggregate's transition functions, and then +it wouldn't need an extra copy step. The results would come back in the +right context already. + +> In fact, you could even optimize the cleanup to only do free'ing if +> some expression memory was allocated. In most cases, it is not. + +Jan's stuff should already fall through pretty quickly if there's +nothing in the context, I think. Note that what we want to do between +tuples is a "context clear" of the expression context, not a "context +delete" and then "context create" a new expression context. Context +clear should be a pretty quick no-op if nothing's been allocated in that +context... + +> In fact the nodeAgg.c patch that I backed out attempted to do that, +> though because there wasn't code that checked if the Datum was +> pg_type.typbyval, it didn't work 100%. + +Right. But if we approach it this way (clear the context at appropriate +times) rather than thinking in terms of explicitly pfree'ing individual +objects, life gets much simpler. Also, if we insist on being able to +pfree individual objects inside a context, we can't use Jan's faster +allocator! Remember, the reason it is faster and lower overhead is that +it doesn't keep track of individual objects, only pools. + +I'd like to see us head in the direction of removing most of the +explicit pfree calls that exist now, and instead rely on clearing +memory contexts at appropriate times in order to manage memory. +The fewer places where we need pfree, the more contexts can be run +with the low-overhead space allocator. Also, the fewer explicit +pfrees we need, the simpler and more reliable the code gets. 
+ + regards, tom lane + + +From owner-pgsql-hackers@hub.org Wed Mar 24 19:10:53 1999 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id TAA00906 + for ; Wed, 24 Mar 1999 19:10:52 -0500 (EST) +Received: from hub.org (majordom@hub.org [209.47.145.100]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id NAA24258 for ; Wed, 24 Mar 1999 13:09:47 -0500 (EST) +Received: from localhost (majordom@localhost) + by hub.org (8.9.2/8.9.1) with SMTP id NAA60743; + Wed, 24 Mar 1999 13:07:26 -0500 (EST) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 24 Mar 1999 13:06:47 +0000 (EST) +Received: (from majordom@localhost) + by hub.org (8.9.2/8.9.1) id NAA60556 + for pgsql-hackers-outgoing; Wed, 24 Mar 1999 13:06:43 -0500 (EST) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +Received: from po7.andrew.cmu.edu (PO7.ANDREW.CMU.EDU [128.2.10.107]) + by hub.org (8.9.2/8.9.1) with ESMTP id NAA60540 + for ; Wed, 24 Mar 1999 13:06:25 -0500 (EST) + (envelope-from er1p+@andrew.cmu.edu) +Received: (from postman@localhost) by po7.andrew.cmu.edu (8.8.5/8.8.2) id NAA06323; Wed, 24 Mar 1999 13:06:16 -0500 (EST) +Received: via switchmail; Wed, 24 Mar 1999 13:06:16 -0500 (EST) +Received: from cloudy.me.cmu.edu via qmail + ID ; + Wed, 24 Mar 1999 13:06:02 -0500 (EST) +Received: from cloudy.me.cmu.edu via qmail + ID ; + Wed, 24 Mar 1999 13:06:00 -0500 (EST) +Received: from mms.4.60.Jun.27.1996.03.05.56.sun4.41.EzMail.2.0.CUILIB.3.45.SNAP.NOT.LINKED.cloudy.me.cmu.edu.sun4m.412 + via MS.5.6.cloudy.me.cmu.edu.sun4_41; + Wed, 24 Mar 1999 13:05:58 -0500 (EST) +Message-ID: +Date: Wed, 24 Mar 1999 13:05:58 -0500 (EST) +From: Erik Riedel +To: Bruce Momjian +Subject: Re: [HACKERS] aggregation memory leak and fix +Cc: pgsql-hackers@postgreSQL.org +In-Reply-To: <199903240611.BAA01206@candle.pha.pa.us> +References: <199903240611.BAA01206@candle.pha.pa.us> +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + + +> I am interested to see if it fixes the expression leak you saw. I have +> not committed this yet. I want to look at it some more. +> +I'm afraid that this doesn't seem to have any effect on my query. + +Looking at your code, I think the problem is that most of the +allocations in my query are on the top part of the if statement that +you modified (i.e. the == SQLlanguageId part). 
Below is a snippet of +a trace from my query, with approximate line numbers for execQual.c +with your patch applied: + +(execQual) language == SQLlanguageId (execQual.c:757) +(execQual) execute postquel_function (execQual.c:759) +(mcxt) MemoryContextAlloc 32 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 16 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 528 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 56 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 88 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 24 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 8 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 65 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 48 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 8 bytes in ** Blank Portal **-heap +(execQual) else clause NOT SQLlanguageId (execQual.c:822) +(execQual) install qual memory context (execQual.c:858) +(execQual) exit qual context (execQual.c:862) +(mcxt) MemoryContextAlloc 60 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextFree in ** Blank Portal **-heap freed 16 bytes +(mcxt) MemoryContextFree in ** Blank Portal **-heap freed 64 bytes +(mcxt) MemoryContextFree in ** Blank Portal **-heap freed 64 bytes +(mcxt) MemoryContextFree in ** Blank Portal **-heap freed 528 bytes +(mcxt) MemoryContextFree in ** Blank Portal **-heap freed 16 bytes +(execQual) return from postquel_function (execQual.c:764) +(execQual) return from ExecEvalFuncArgs (execQual.c:792) +(execQual) else clause NOT SQLlanguageId (execQual.c:822) +(execQual) install qual memory context (execQual.c:858) +(execQual) exit qual context (execQual.c:862) +(mcxt) MemoryContextAlloc 108 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 108 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextFree in ** Blank Portal **-heap freed 128 bytes +(execQual) else clause NOT SQLlanguageId (execQual.c:822) +(execQual) install qual memory context (execQual.c:858) +(mcxt) MemoryContextAlloc 8 bytes in -heap +(execQual) exit qual context (execQual.c:862) + + + +(execQual) language == SQLlanguageId (execQual.c:757) +(execQual) execute postquel_function (execQual.c:759) +(mcxt) MemoryContextAlloc 32 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 16 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 528 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 56 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 88 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 24 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 8 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 65 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 48 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 8 bytes in ** Blank Portal **-heap +(execQual) else clause NOT SQLlanguageId (execQual.c:822) +(execQual) install qual memory context (execQual.c:858) +(execQual) exit qual context (execQual.c:862) +(mcxt) MemoryContextAlloc 60 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextFree in ** Blank Portal **-heap freed 16 bytes +(mcxt) MemoryContextFree in ** Blank Portal **-heap freed 64 bytes +(mcxt) MemoryContextFree in ** Blank Portal **-heap freed 64 bytes +(mcxt) MemoryContextFree in ** Blank Portal **-heap freed 528 bytes +(mcxt) MemoryContextFree in ** Blank Portal **-heap freed 16 bytes +(execQual) return from postquel_function (execQual.c:764) +(execQual) return from ExecEvalFuncArgs (execQual.c:792) +(execQual) else 
clause NOT SQLlanguageId (execQual.c:822) +(execQual) install qual memory context (execQual.c:858) +(execQual) exit qual context (execQual.c:862) +(mcxt) MemoryContextAlloc 108 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextAlloc 108 bytes in ** Blank Portal **-heap +(mcxt) MemoryContextFree in ** Blank Portal **-heap freed 128 bytes +(execQual) else clause NOT SQLlanguageId (execQual.c:822) +(execQual) install qual memory context (execQual.c:858) +(mcxt) MemoryContextAlloc 8 bytes in -heap +(execQual) exit qual context (execQual.c:862) + + +the MemoryContext lines give the name of the portal where each +allocation is happening - you see that your Qual manager only captures +a very small number (one) of the allocations, the rest are in the +upper part of the if statement. + +Note that I also placed a printf next to your EndPortalAllocMode() and +StartPortalAllocMode() fix in ExecQual() - I believe this is what is +supposed to clear the portal and free the memory - and that printf +never appears in the above trace. + +Sorry if the trace is a little confusing, but I hope that it helps you +zero in. + +Erik + + + + + + + +From owner-pgsql-hackers@hub.org Sat May 15 23:13:50 1999 +Received: from hub.org (hub.org [209.167.229.1]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id XAA29144 + for ; Sat, 15 May 1999 23:13:49 -0400 (EDT) +Received: from hub.org (hub.org [209.167.229.1]) + by hub.org (8.9.3/8.9.3) with ESMTP id XAA25173; + Sat, 15 May 1999 23:11:03 -0400 (EDT) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Sat, 15 May 1999 23:10:29 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.9.3/8.9.3) id XAA25111 + for pgsql-hackers-outgoing; Sat, 15 May 1999 23:10:27 -0400 (EDT) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-hackers@postgreSQL.org using -f +Received: from sss.sss.pgh.pa.us (sss.pgh.pa.us [206.210.65.6]) + by hub.org (8.9.3/8.9.3) with ESMTP id XAA25092 + for ; Sat, 15 May 1999 23:10:22 -0400 (EDT) + (envelope-from tgl@sss.pgh.pa.us) +Received: from sss.sss.pgh.pa.us (localhost [127.0.0.1]) + by sss.sss.pgh.pa.us (8.9.1/8.9.1) with ESMTP id XAA17752 + for ; Sat, 15 May 1999 23:09:46 -0400 (EDT) +To: pgsql-hackers@postgreSQL.org +Subject: [HACKERS] Memory leaks in relcache +Date: Sat, 15 May 1999 23:09:46 -0400 +Message-ID: <17750.926824186@sss.pgh.pa.us> +From: Tom Lane +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +I have been looking into why a reference to a nonexistent table, eg + INSERT INTO nosuchtable VALUES(1); +leaks a small amount of memory per occurrence. What I find is a +memory leak in the indexscan support. Specifically, +RelationGetIndexScan in backend/access/index/genam.c palloc's both +an IndexScanDesc and some keydata storage. The IndexScanDesc +block is eventually pfree'd, at the bottom of CatalogIndexFetchTuple +in backend/catalog/indexing.c. But the keydata block is not. + +This wouldn't matter so much if the palloc were coming from a +transaction-local context. But what we're doing is a lookup in pg_class +on behalf of RelationBuildDesc in backend/utils/cache/relcache.c, and +it's done a MemoryContextSwitchTo into the global CacheCxt before +starting the lookup. Therefore, the un-pfreed block represents a +permanent memory leak. + +In fact, *every* reference to a relation that is not already present in +the relcache causes a similar leak. 
The error case is just the one that +is easiest to repeat. The missing pfree of the keydata block is +probably causing a bunch of other short-term and long-term leaks too. + +It seems to me there are two things to fix here: indexscan ought to +pfree everything it pallocs, and RelationBuildDesc ought to be warier +about how much work gets done with CacheCxt as the active palloc +context. (Even if indexscan didn't leak anything ordinarily, there's +still the risk of elog(ERROR) causing an abort before the indexscan code +gets to clean up.) + +Comments? In particular, where is the cleanest place to add the pfree +of the keydata block? I don't especially like the fact that callers +of index_endscan have to clean up the toplevel scan block; I think that +ought to happen inside index_endscan. + + regards, tom lane + + diff --git a/doc/TODO.detail/nulls b/doc/TODO.detail/nulls new file mode 100644 index 0000000000..567bfb5f21 --- /dev/null +++ b/doc/TODO.detail/nulls @@ -0,0 +1,119 @@ +From owner-pgsql-general@hub.org Fri Oct 9 18:22:09 1998 +Received: from hub.org (majordom@hub.org [209.47.148.200]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id SAA04220 + for ; Fri, 9 Oct 1998 18:22:08 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id SAA26960; + Fri, 9 Oct 1998 18:18:29 -0400 (EDT) + (envelope-from owner-pgsql-general@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Fri, 09 Oct 1998 18:18:07 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id SAA26917 + for pgsql-general-outgoing; Fri, 9 Oct 1998 18:18:04 -0400 (EDT) + (envelope-from owner-pgsql-general@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-general@postgreSQL.org using -f +Received: from gecko.statsol.com (gecko.statsol.com [198.11.51.133]) + by hub.org (8.8.8/8.8.8) with ESMTP id SAA26904 + for ; Fri, 9 Oct 1998 18:17:46 -0400 (EDT) + (envelope-from statsol@statsol.com) +Received: from gecko (gecko [198.11.51.133]) + by gecko.statsol.com (8.9.0/8.9.0) with SMTP id SAA00557 + for ; Fri, 9 Oct 1998 18:18:00 -0400 (EDT) +Date: Fri, 9 Oct 1998 18:18:00 -0400 (EDT) +From: Steve Doliov +X-Sender: statsol@gecko +To: pgsql-general@postgreSQL.org +Subject: Re: [GENERAL] Making NULLs visible. +Message-ID: +MIME-Version: 1.0 +Content-Type: TEXT/PLAIN; charset=US-ASCII +Sender: owner-pgsql-general@postgreSQL.org +Precedence: bulk +Status: RO + +On Fri, 9 Oct 1998, Bruce Momjian wrote: + +> [Charset iso-8859-1 unsupported, filtering to ASCII...] +> > > Yes, \ always outputs as \\, excepts someone changed it last week, and I +> > > am requesting a reversal. Do you like the \N if it is unique? +> > +> > Well, it's certainly clear, but could be confused with \n (newline). Can we +> > have \0 instead? +> +> Yes, but it is uppercase. \0 looks like an octal number to me, and I +> think we even output octals sometimes, don't we? +> + +my first suggestion may have been hare-brained, but why not just make the +specifics of the output user-configurable. So if the user chooses \0, so +be it, if the user chooses \N so be it, if the user likes NULL so be it. +but the option would only have one value per database at any given point +in time. so database x could use \N on tuesday and NULL on wednesday, but +database x could never have two references to the characters(s) used to +represent a null value. 
+ +steve + + + + +From owner-pgsql-general@hub.org Sun Oct 11 17:31:08 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id RAA20043 + for ; Sun, 11 Oct 1998 17:31:02 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id RAA03069 for ; Sun, 11 Oct 1998 17:10:34 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id QAA10856; + Sun, 11 Oct 1998 16:57:34 -0400 (EDT) + (envelope-from owner-pgsql-general@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Sun, 11 Oct 1998 16:53:35 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id QAA10393 + for pgsql-general-outgoing; Sun, 11 Oct 1998 16:53:34 -0400 (EDT) + (envelope-from owner-pgsql-general@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-general@postgreSQL.org using -f +Received: from mail1.panix.com (mail1.panix.com [166.84.0.212]) + by hub.org (8.8.8/8.8.8) with ESMTP id QAA10378 + for ; Sun, 11 Oct 1998 16:53:28 -0400 (EDT) + (envelope-from tomg@admin.nrnet.org) +Received: from mailhost.nrnet.org (root@mailhost.nrnet.org [166.84.192.39]) + by mail1.panix.com (8.8.8/8.8.8/PanixM1.3) with ESMTP id QAA16311 + for ; Sun, 11 Oct 1998 16:53:24 -0400 (EDT) +Received: from admin.nrnet.org (uucp@localhost) + by mailhost.nrnet.org (8.8.7/8.8.4) with UUCP + id QAA16345 for pgsql-general@postgreSQL.org; Sun, 11 Oct 1998 16:28:47 -0400 +Received: from localhost (tomg@localhost) + by admin.nrnet.org (8.8.7/8.8.7) with SMTP id QAA11569 + for ; Sun, 11 Oct 1998 16:28:41 -0400 +Date: Sun, 11 Oct 1998 16:28:41 -0400 (EDT) +From: Thomas Good +To: pgsql-general@postgreSQL.org +Subject: Re: [GENERAL] Making NULLs visible. +In-Reply-To: +Message-ID: +MIME-Version: 1.0 +Content-Type: TEXT/PLAIN; charset=US-ASCII +Sender: owner-pgsql-general@postgreSQL.org +Precedence: bulk +Status: RO + +Watching all this go by...as a guy who has to move alot of data +from legacy dbs to postgres, I've gotten used to \N being a null. + +My vote, if I were allowed to cast one, would be to have one null +and that would be the COPY command null. I have no difficulty +distinguishing a null from a newline... + +At the pgsql command prompt I would find seeing \N rather reassuring. +I've seen alot of these little guys. + + ---------- Sisters of Charity Medical Center ---------- + Department of Psychiatry + ---- + Thomas Good + Coordinator, North Richmond C.M.H.C. 
Information Systems + 75 Vanderbilt Ave, Quarters 8 Phone: 718-354-5528 + Staten Island, NY 10304 Fax: 718-354-5056 + + + diff --git a/doc/TODO.detail/optimizer b/doc/TODO.detail/optimizer new file mode 100644 index 0000000000..aa059b8368 --- /dev/null +++ b/doc/TODO.detail/optimizer @@ -0,0 +1,987 @@ +From owner-pgsql-hackers@hub.org Mon Mar 22 18:43:41 1999 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id SAA23978 + for ; Mon, 22 Mar 1999 18:43:39 -0500 (EST) +Received: from hub.org (majordom@hub.org [209.47.145.100]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id SAA06472 for ; Mon, 22 Mar 1999 18:36:44 -0500 (EST) +Received: from localhost (majordom@localhost) + by hub.org (8.9.2/8.9.1) with SMTP id SAA92604; + Mon, 22 Mar 1999 18:34:23 -0500 (EST) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Mon, 22 Mar 1999 18:33:50 +0000 (EST) +Received: (from majordom@localhost) + by hub.org (8.9.2/8.9.1) id SAA92469 + for pgsql-hackers-outgoing; Mon, 22 Mar 1999 18:33:47 -0500 (EST) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +Received: from po8.andrew.cmu.edu (PO8.ANDREW.CMU.EDU [128.2.10.108]) + by hub.org (8.9.2/8.9.1) with ESMTP id SAA92456 + for ; Mon, 22 Mar 1999 18:33:41 -0500 (EST) + (envelope-from er1p+@andrew.cmu.edu) +Received: (from postman@localhost) by po8.andrew.cmu.edu (8.8.5/8.8.2) id SAA12894 for pgsql-hackers@postgresql.org; Mon, 22 Mar 1999 18:33:38 -0500 (EST) +Received: via switchmail; Mon, 22 Mar 1999 18:33:38 -0500 (EST) +Received: from cloudy.me.cmu.edu via qmail + ID ; + Mon, 22 Mar 1999 18:27:20 -0500 (EST) +Received: from cloudy.me.cmu.edu via qmail + ID ; + Mon, 22 Mar 1999 18:27:17 -0500 (EST) +Received: from mms.4.60.Jun.27.1996.03.05.56.sun4.41.EzMail.2.0.CUILIB.3.45.SNAP.NOT.LINKED.cloudy.me.cmu.edu.sun4m.412 + via MS.5.6.cloudy.me.cmu.edu.sun4_41; + Mon, 22 Mar 1999 18:27:15 -0500 (EST) +Message-ID: +Date: Mon, 22 Mar 1999 18:27:15 -0500 (EST) +From: Erik Riedel +To: pgsql-hackers@postgreSQL.org +Subject: [HACKERS] optimizer and type question +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + + +[last week aggregation, this week, the optimizer] + +I have a somewhat general optimizer question/problem that I would like +to get some input on - i.e. I'd like to know what is "supposed" to +work here and what I should be expecting. Sadly, I think the patch +for this is more involved than my last message. 
+ +Using my favorite table these days: + +Table = lineitem ++------------------------+----------------------------------+-------+ +| Field | Type | Length| ++------------------------+----------------------------------+-------+ +| l_orderkey | int4 not null | 4 | +| l_partkey | int4 not null | 4 | +| l_suppkey | int4 not null | 4 | +| l_linenumber | int4 not null | 4 | +| l_quantity | float4 not null | 4 | +| l_extendedprice | float4 not null | 4 | +| l_discount | float4 not null | 4 | +| l_tax | float4 not null | 4 | +| l_returnflag | char() not null | 1 | +| l_linestatus | char() not null | 1 | +| l_shipdate | date | 4 | +| l_commitdate | date | 4 | +| l_receiptdate | date | 4 | +| l_shipinstruct | char() not null | 25 | +| l_shipmode | char() not null | 10 | +| l_comment | char() not null | 44 | ++------------------------+----------------------------------+-------+ +Index: lineitem_index_ + +and the query: + +-- +-- Query 1 +-- +explain select l_returnflag, l_linestatus, sum(l_quantity) as sum_qty, +sum(l_extendedprice) as sum_base_price, +sum(l_extendedprice*(1-l_discount)) as sum_disc_price, +sum(l_extendedprice*(1-l_discount)*(1+l_tax)) as sum_charge, +avg(l_quantity) as avg_qty, avg(l_extendedprice) as avg_price, +avg(l_discount) as avg_disc, count(*) as count_order +from lineitem +where l_shipdate <= '1998-09-02'::date +group by l_returnflag, l_linestatus +order by l_returnflag, l_linestatus; + + +note that I have eliminated the date calculation in my query of last +week and manually replaced it with a constant (since this wasn't +happening automatically - but let's not worry about that for now). +And this is only an explain, we care about the optimizer. So we get: + +Sort (cost=34467.88 size=0 width=0) + -> Aggregate (cost=34467.88 size=0 width=0) + -> Group (cost=34467.88 size=0 width=0) + -> Sort (cost=34467.88 size=0 width=0) + -> Seq Scan on lineitem (cost=34467.88 size=200191 width=44) + +so let's think about the selectivity that is being chosen for the +seq scan (the where l_shipdate <= '1998-09-02'). + +Turns out the optimizer is choosing "33%", even though the real answer +is somewhere in 90+% (that's how the query is designed). So, why does +it do that? + +Turns out that selectivity in this case is determined via +plancat::restriction_selectivity() which calls into functionOID = 103 +(intltsel) for operatorOID = 1096 (date "<=") on relation OID = 18663 +(my lineitem). + +This all follows because of the description of 1096 (date "<=") in +pg_operator. Looking at local1_template1.bki.source near line 1754 +shows: + +insert OID = 1096 ( "<=" PGUID 0 <...> date_le intltsel intltjoinsel ) + +where we see that indeed, it thinks "intltsel" is the right function +to use for "oprrest" in the case of dates. + +Question 1 - is intltsel the right thing for selectivity on dates? + +Hope someone is still with me. + +So now we're running selfuncs::intltsel() where we make a further call +to selfuncs::gethilokey(). The job of gethilokey is to determine the +min and max values of a particular attribute in the table, which will +then be used with the constant in my where clause to estimate the +selectivity. It is going to search the pg_statistic relation with +three key values: + +Anum_pg_statistic_starelid 18663 (lineitem) +Anum_pg_statistic_staattnum 11 (l_shipdate) +Anum_pg_statistic_staop 1096 (date "<=") + +this finds no tuples in pg_statistic. Why is that? 
The only nearby +tuple in pg_statistic is: + +starelid|staattnum|staop|stalokey |stahikey +--------+---------+-----+----------------+---------------- + 18663| 11| 0|01-02-1992 |12-01-1998 + +and the reason the query doesn't match anything? Because 1096 != 0. +But why is it 0 in pg_statistic? Statistics are determined near line +1844 in vacuum.c (assuming a 'vacuum analyze' run at some point) + + i = 0; + values[i++] = (Datum) relid; /* 1 */ + values[i++] = (Datum) attp->attnum; /* 2 */ +====> values[i++] = (Datum) InvalidOid; /* 3 */ + fmgr_info(stats->outfunc, &out_function); + out_string = <...min...> + values[i++] = (Datum) fmgr(F_TEXTIN, out_string); + pfree(out_string); + out_string = <...max...> + values[i++] = (Datum) fmgr(F_TEXTIN, out_string); + pfree(out_string); + stup = heap_formtuple(sd->rd_att, values, nulls); + +the "offending" line is setting the staop to InvalidOid (i.e. 0). + +Question 2 - is this right? Is the intent for 0 to serve as a +"wildcard", or should it be inserting an entry for each operation +individually? + +In the case of "wildcard" then gethilokey() should allow a match for + +Anum_pg_statistic_staop 0 + +instead of requiring the more restrictive 1096. In the current code, +what happens next is gethilokey() returns "not found" and intltsel() +returns the default 1/3 which I see in the resultant query plan (size += 200191 is 1/3 of the number of lineitem tuples). + +Question 3 - is there any inherent reason it couldn't get this right? +The statistic is in the table 1992 to 1998, so the '1998-09-02' date +should be 90-some% selectivity, a much better guess than 33%. + +Doesn't make a difference for this particular query, of course, +because the seq scan must proceed anyhow, but it could easily affect +other queries where selectivities matter (and it affects the +modifications I am trying to test in the optimizer to be "smarter" +about selectivities - my overall context is to understand/improve the +behavior that the underlying storage system sees from queries like this). + +OK, so let's say we treat 0 as a "wildcard" and stop checking for +1096. Not we let gethilokey() return the two dates from the statistic +table. The immediate next thing that intltsel() does, near lines 122 +in selfuncs.c is call atol() on the strings from gethilokey(). And +guess what it comes up with? + +low = 1 +high = 12 + +because it calls atol() on '01-02-1992' and '12-01-1998'. This +clearly isn't right, it should get some large integer that includes +the year and day in the result. Then it should compare reasonably +with my constant from the where clause and give a decent selectivity +value. This leads to a re-visit of Question 1. + +Question 4 - should date "<=" use a dateltsel() function instead of +intltsel() as oprrest? + +If anyone is still with me, could you tell me if this makes sense, or +if there is some other location where the appropriate type conversion +could take place so that intltsel() gets something reasonable when it +does the atol() calls? + +Could someone also give me a sense for how far out-of-whack the whole +current selectivity-handling structure is? It seems that most of the +operators in pg_operator actually use intltsel() and would have +type-specific problems like that described. Or is the problem in the +way attribute values are stored in pg_statistic by vacuum analyze? Or +is there another layer where type conversion belongs? + +Phew. Enough typing, hope someone can follow this and address at +least some of the questions. + +Thanks. 
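+
+(A trivial standalone check of the atol() behaviour in question: atol() stops
+at the first character that is not a digit, so only the leading month field of
+each date string survives.)
+
+    #include <stdio.h>
+    #include <stdlib.h>
+
+    int
+    main(void)
+    {
+        /* prints "1 12": atol() stops at the first '-' in each string */
+        printf("%ld %ld\n", atol("01-02-1992"), atol("12-01-1998"));
+        return 0;
+    }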
+ +Erik Riedel +Carnegie Mellon University +www.cs.cmu.edu/~riedel + + + +From owner-pgsql-hackers@hub.org Mon Mar 22 20:31:11 1999 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id UAA00802 + for ; Mon, 22 Mar 1999 20:31:09 -0500 (EST) +Received: from hub.org (majordom@hub.org [209.47.145.100]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id UAA13231 for ; Mon, 22 Mar 1999 20:15:20 -0500 (EST) +Received: from localhost (majordom@localhost) + by hub.org (8.9.2/8.9.1) with SMTP id UAA01981; + Mon, 22 Mar 1999 20:14:04 -0500 (EST) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Mon, 22 Mar 1999 20:13:32 +0000 (EST) +Received: (from majordom@localhost) + by hub.org (8.9.2/8.9.1) id UAA01835 + for pgsql-hackers-outgoing; Mon, 22 Mar 1999 20:13:28 -0500 (EST) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +Received: from sss.sss.pgh.pa.us (sss.pgh.pa.us [206.210.65.6]) + by hub.org (8.9.2/8.9.1) with ESMTP id UAA01822 + for ; Mon, 22 Mar 1999 20:13:21 -0500 (EST) + (envelope-from tgl@sss.pgh.pa.us) +Received: from sss.sss.pgh.pa.us (localhost [127.0.0.1]) + by sss.sss.pgh.pa.us (8.9.1/8.9.1) with ESMTP id UAA23294; + Mon, 22 Mar 1999 20:12:43 -0500 (EST) +To: Erik Riedel +cc: pgsql-hackers@postgreSQL.org +Subject: Re: [HACKERS] optimizer and type question +In-reply-to: Your message of Mon, 22 Mar 1999 18:27:15 -0500 (EST) + +Date: Mon, 22 Mar 1999 20:12:43 -0500 +Message-ID: <23292.922151563@sss.pgh.pa.us> +From: Tom Lane +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + +Erik Riedel writes: +> [ optimizer doesn't find relevant pg_statistic entry ] + +It's clearly a bug that the selectivity code is not finding this tuple. +If your analysis is correct, then selectivity estimation has *never* +worked properly, or at least not in recent memory :-(. Yipes. +Bruce and I found a bunch of other problems in the optimizer recently, +so it doesn't faze me to assume that this is broken too. + +> the "offending" line is setting the staop to InvalidOid (i.e. 0). +> Question 2 - is this right? Is the intent for 0 to serve as a +> "wildcard", + +My thought is that what the staop column ought to be is the OID of the +comparison function that was used to determine the sort order of the +column. Without a sort op the lowest and highest keys in the column are +not well defined, so it makes no sense to assert "these are the lowest +and highest values" without providing the sort op that determined that. +(For sufficiently complex data types one could reasonably have multiple +ordering operators. A crude example is sorting on "circumference" and +"area" for polygons.) But typically the sort op will be the "<" +operator for the column data type. + +So, the vacuum code is definitely broken --- it's not storing the sort +op that it used. The code in gethilokey might be broken too, depending +on how it is producing the operator it's trying to match against the +tuple. For example, if the actual operator in the query is any of +< <= > >= on int4, then int4lt ought to be used to probe the pg_statistic +table. I'm not sure if we have adequate info in pg_operator or pg_type +to let the optimizer code determine the right thing to probe with :-( + +> The immediate next thing that intltsel() does, near lines 122 +> in selfuncs.c is call atol() on the strings from gethilokey(). And +> guess what it comes up with? 
+> low = 1 +> high = 12 +> because it calls atol() on '01-02-1992' and '12-01-1998'. This +> clearly isn't right, it should get some large integer that includes +> the year and day in the result. Then it should compare reasonably +> with my constant from the where clause and give a decent selectivity +> value. This leads to a re-visit of Question 1. +> Question 4 - should date "<=" use a dateltsel() function instead of +> intltsel() as oprrest? + +This is clearly busted as well. I'm not sure that creating dateltsel() +is the right fix, however, because if you go down that path then every +single datatype needs its own selectivity function; that's more than we +need. + +What we really want here is to be able to map datatype values into +some sort of numeric range so that we can compute what fraction of the +low-key-to-high-key range is on each side of the probe value (the +constant taken from the query). This general concept will apply to +many scalar types, so what we want is a type-specific mapping function +and a less-specific fraction-computing-function. Offhand I'd say that +we want intltsel() and floatltsel(), plus conversion routines that can +produce either int4 or float8 from a data type as seems appropriate. +Anything that couldn't map to one or the other would have to supply its +own selectivity function. + +> Or is the problem in the +> way attribute values are stored in pg_statistic by vacuum analyze? + +Looks like it converts the low and high values to text and stores them +that way. Ugly as can be :-( but I'm not sure there is a good +alternative. We have no "wild card" column type AFAIK, which is what +these columns of pg_statistic would have to be to allow storage of +unconverted min and max values. + +I think you've found a can of worms here. 
Congratulations ;-) + + regards, tom lane + + +From owner-pgsql-hackers@hub.org Mon Mar 22 23:31:00 1999 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id XAA03384 + for ; Mon, 22 Mar 1999 23:30:58 -0500 (EST) +Received: from hub.org (majordom@hub.org [209.47.145.100]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id XAA25586 for ; Mon, 22 Mar 1999 23:18:25 -0500 (EST) +Received: from localhost (majordom@localhost) + by hub.org (8.9.2/8.9.1) with SMTP id XAA17955; + Mon, 22 Mar 1999 23:17:24 -0500 (EST) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Mon, 22 Mar 1999 23:16:49 +0000 (EST) +Received: (from majordom@localhost) + by hub.org (8.9.2/8.9.1) id XAA17764 + for pgsql-hackers-outgoing; Mon, 22 Mar 1999 23:16:46 -0500 (EST) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +Received: from po8.andrew.cmu.edu (PO8.ANDREW.CMU.EDU [128.2.10.108]) + by hub.org (8.9.2/8.9.1) with ESMTP id XAA17745 + for ; Mon, 22 Mar 1999 23:16:39 -0500 (EST) + (envelope-from er1p+@andrew.cmu.edu) +Received: (from postman@localhost) by po8.andrew.cmu.edu (8.8.5/8.8.2) id XAA04273; Mon, 22 Mar 1999 23:16:37 -0500 (EST) +Received: via switchmail; Mon, 22 Mar 1999 23:16:37 -0500 (EST) +Received: from hazy.adsl.net.cmu.edu via qmail + ID ; + Mon, 22 Mar 1999 23:15:09 -0500 (EST) +Received: from hazy.adsl.net.cmu.edu via qmail + ID ; + Mon, 22 Mar 1999 23:15:00 -0500 (EST) +Received: from mms.4.60.Jun.27.1996.03.02.53.sun4.51.EzMail.2.0.CUILIB.3.45.SNAP.NOT.LINKED.hazy.adsl.net.cmu.edu.sun4m.54 + via MS.5.6.hazy.adsl.net.cmu.edu.sun4_51; + Mon, 22 Mar 1999 23:14:55 -0500 (EST) +Message-ID: <4qxlJ0200anI01hK40@andrew.cmu.edu> +Date: Mon, 22 Mar 1999 23:14:55 -0500 (EST) +From: Erik Riedel +To: Tom Lane +Subject: Re: [HACKERS] optimizer and type question +Cc: pgsql-hackers@postgreSQL.org +In-Reply-To: <23292.922151563@sss.pgh.pa.us> +References: <23292.922151563@sss.pgh.pa.us> +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: ROr + + +OK, building on your high-level explanation, I am attaching a patch that +attempts to do something "better" than the current code. Note that I +have only tested this with the date type and my particular query. I +haven't run it through the regression, so consider it "proof of concept" +at best. Although hopefully it will serve my purposes. + +> My thought is that what the staop column ought to be is the OID of the +> comparison function that was used to determine the sort order of the +> column. Without a sort op the lowest and highest keys in the column are +> not well defined, so it makes no sense to assert "these are the lowest +> and highest values" without providing the sort op that determined that. +> +> (For sufficiently complex data types one could reasonably have multiple +> ordering operators. A crude example is sorting on "circumference" and +> "area" for polygons.) But typically the sort op will be the "<" +> operator for the column data type. +> +I changed vacuum.c to do exactly that. oid of the lt sort op. + +> So, the vacuum code is definitely broken --- it's not storing the sort +> op that it used. The code in gethilokey might be broken too, depending +> on how it is producing the operator it's trying to match against the +> tuple. For example, if the actual operator in the query is any of +> < <= > >= on int4, then int4lt ought to be used to probe the pg_statistic +> table. 
I'm not sure if we have adequate info in pg_operator or pg_type +> to let the optimizer code determine the right thing to probe with :-( +> +This indeed seems like a bigger problem. I thought about somehow using +type-matching from the sort op and the actual operator in the query - if +both the left and right type match, then consider them the same for +purposes of this probe. That seemed complicated, so I punted in my +example - it just does the search with relid and attnum and assumes that +only returns one tuple. This works in my case (maybe in all cases, +because of the way vacuum is currently written - ?). + +> What we really want here is to be able to map datatype values into +> some sort of numeric range so that we can compute what fraction of the +> low-key-to-high-key range is on each side of the probe value (the +> constant taken from the query). This general concept will apply to +> many scalar types, so what we want is a type-specific mapping function +> and a less-specific fraction-computing-function. Offhand I'd say that +> we want intltsel() and floatltsel(), plus conversion routines that can +> produce either int4 or float8 from a data type as seems appropriate. +> Anything that couldn't map to one or the other would have to supply its +> own selectivity function. +> +This is what my example then does. Uses the stored sort op to get the +type and then uses typinput to convert from the string to an int4. + +Then puts the int4 back into string format because that's what everyone +was expecting. + +It seems to work for my particular query. I now get: + +(selfuncs) gethilokey() obj 18663 attr 11 opid 1096 (ignored) +(selfuncs) gethilokey() found op 1087 in pg_proc +(selfuncs) gethilokey() found type 1082 in pg_type +(selfuncs) gethilokey() going to use 1084 to convert type 1082 +(selfuncs) gethilokey() have low -2921 high -396 +(selfuncs) intltsel() high -396 low -2921 val -486 +(plancat) restriction_selectivity() for func 103 op 1096 rel 18663 attr +11 const -486 flag 3 returns 0.964356 +NOTICE: QUERY PLAN: + +Sort (cost=34467.88 size=0 width=0) + -> Aggregate (cost=34467.88 size=0 width=0) + -> Group (cost=34467.88 size=0 width=0) + -> Sort (cost=34467.88 size=0 width=0) + -> Seq Scan on lineitem (cost=34467.88 size=579166 width=44) + +including my printfs, which exist in the patch as well. + +Selectivity is now the expected 96% and the size estimate for the seq +scan is much closer to correct. + +Again, not tested with anything besides date, so caveat not-tested. + +Hope this helps. + +Erik + +----------------------[optimizer_fix.sh]------------------------ + +#! /bin/sh +# This is a shell archive, meaning: +# 1. Remove everything above the #! /bin/sh line. +# 2. Save the resulting text in a file. +# 3. 
Execute the file with /bin/sh (not csh) to create: +# selfuncs.c.diff +# vacuum.c.diff +# This archive created: Mon Mar 22 22:58:14 1999 +export PATH; PATH=/bin:/usr/bin:$PATH +if test -f 'selfuncs.c.diff' +then + echo shar: "will not over-write existing file 'selfuncs.c.diff'" +else +cat << \SHAR_EOF > 'selfuncs.c.diff' +*** +/afs/ece.cmu.edu/project/lcs/lcs-004/er1p/postgres/611/src/backend/utils/adt +/selfuncs.c Thu Mar 11 23:59:35 1999 +--- +/afs/ece.cmu.edu/project/lcs/lcs-004/er1p/postgres/615/src/backend/utils/adt +/selfuncs.c Mon Mar 22 22:57:25 1999 +*************** +*** 32,37 **** +--- 32,40 ---- + #include "utils/lsyscache.h" /* for get_oprrest() */ + #include "catalog/pg_statistic.h" + ++ #include "catalog/pg_proc.h" /* for Form_pg_proc */ ++ #include "catalog/pg_type.h" /* for Form_pg_type */ ++ + /* N is not a valid var/constant or relation id */ + #define NONVALUE(N) ((N) == -1) + +*************** +*** 103,110 **** + bottom; + + result = (float64) palloc(sizeof(float64data)); +! if (NONVALUE(attno) || NONVALUE(relid)) + *result = 1.0 / 3; + else + { + /* XXX val = atol(value); */ +--- 106,114 ---- + bottom; + + result = (float64) palloc(sizeof(float64data)); +! if (NONVALUE(attno) || NONVALUE(relid)) { + *result = 1.0 / 3; ++ } + else + { + /* XXX val = atol(value); */ +*************** +*** 117,130 **** + } + high = atol(highchar); + low = atol(lowchar); + if ((flag & SEL_RIGHT && val < low) || + (!(flag & SEL_RIGHT) && val > high)) + { + float32data nvals; + + nvals = getattdisbursion(relid, (int) attno); +! if (nvals == 0) + *result = 1.0 / 3.0; + else + { + *result = 3.0 * (float64data) nvals; +--- 121,136 ---- + } + high = atol(highchar); + low = atol(lowchar); ++ printf("(selfuncs) intltsel() high %d low %d val %d\n",high,low,val); + if ((flag & SEL_RIGHT && val < low) || + (!(flag & SEL_RIGHT) && val > high)) + { + float32data nvals; + + nvals = getattdisbursion(relid, (int) attno); +! if (nvals == 0) { + *result = 1.0 / 3.0; ++ } + else + { + *result = 3.0 * (float64data) nvals; +*************** +*** 336,341 **** +--- 342,353 ---- + { + Relation rel; + HeapScanDesc scan; ++ /* this assumes there is only one row in the statistics table for any +particular */ ++ /* relid, attnum pair - could be more complicated if staop is also +used. */ ++ /* at the moment, if there are multiple rows, this code ends up +picking the */ ++ /* "first" one + - er1p */ ++ /* the actual "ignoring" is done in the call to heap_beginscan() +below, where */ ++ /* we only mention 2 of the 3 keys in this array + - er1p */ + static ScanKeyData key[3] = { + {0, Anum_pg_statistic_starelid, F_OIDEQ, {0, 0, F_OIDEQ}}, + {0, Anum_pg_statistic_staattnum, F_INT2EQ, {0, 0, F_INT2EQ}}, +*************** +*** 344,355 **** + bool isnull; + HeapTuple tuple; + + rel = heap_openr(StatisticRelationName); + + key[0].sk_argument = ObjectIdGetDatum(relid); + key[1].sk_argument = Int16GetDatum((int16) attnum); + key[2].sk_argument = ObjectIdGetDatum(opid); +! scan = heap_beginscan(rel, 0, SnapshotNow, 3, key); + tuple = heap_getnext(scan, 0); + if (!HeapTupleIsValid(tuple)) + { +--- 356,377 ---- + bool isnull; + HeapTuple tuple; + ++ HeapTuple tup; ++ Form_pg_proc proc; ++ Form_pg_type typ; ++ Oid which_op; ++ Oid which_type; ++ int32 low_value; ++ int32 high_value; ++ + rel = heap_openr(StatisticRelationName); + + key[0].sk_argument = ObjectIdGetDatum(relid); + key[1].sk_argument = Int16GetDatum((int16) attnum); + key[2].sk_argument = ObjectIdGetDatum(opid); +! 
printf("(selfuncs) gethilokey() obj %d attr %d opid %d (ignored)\n", +! key[0].sk_argument,key[1].sk_argument,key[2].sk_argument); +! scan = heap_beginscan(rel, 0, SnapshotNow, 2, key); + tuple = heap_getnext(scan, 0); + if (!HeapTupleIsValid(tuple)) + { +*************** +*** 376,383 **** +--- 398,461 ---- + &isnull)); + if (isnull) + elog(DEBUG, "gethilokey: low key is null"); ++ + heap_endscan(scan); + heap_close(rel); ++ ++ /* now we deal with type conversion issues + */ ++ /* when intltsel() calls this routine (who knows what other callers +might do) */ ++ /* it assumes that it can call atol() on the strings and then use +integer */ ++ /* comparison from there. what we are going to do here, then, is try +to use */ ++ /* the type information from Anum_pg_statistic_staop to convert the +high */ ++ /* and low values +- er1p */ ++ ++ /* WARNING: this code has only been tested with the date type and has +NOT */ ++ /* been regression tested. consider it "sample" code of what might +be the */ ++ /* right kind of thing to do +- er1p */ ++ ++ /* get the 'op' from pg_statistic and look it up in pg_proc */ ++ which_op = heap_getattr(tuple, ++ Anum_pg_statistic_staop, ++ RelationGetDescr(rel), ++ &isnull); ++ if (InvalidOid == which_op) { ++ /* ignore all this stuff, try conversion only if we have a valid staop */ ++ /* note that there is an accompanying change to 'vacuum analyze' that */ ++ /* gets this set to something useful. */ ++ } else { ++ /* staop looks valid, so let's see what we can do about conversion */ ++ tup = SearchSysCacheTuple(PROOID, ObjectIdGetDatum(which_op), 0, 0, 0); ++ if (!HeapTupleIsValid(tup)) { ++ elog(ERROR, "selfuncs: unable to find op in pg_proc %d", which_op); ++ } ++ printf("(selfuncs) gethilokey() found op %d in pg_proc\n",which_op); ++ ++ /* use that to determine the type of stahikey and stalokey via pg_type */ ++ proc = (Form_pg_proc) GETSTRUCT(tup); ++ which_type = proc->proargtypes[0]; /* XXX - use left and right +separately? 
*/ ++ tup = SearchSysCacheTuple(TYPOID, ObjectIdGetDatum(which_type), 0, 0, 0); ++ if (!HeapTupleIsValid(tup)) { ++ elog(ERROR, "selfuncs: unable to find type in pg_type %d", which_type); ++ } ++ printf("(selfuncs) gethilokey() found type %d in pg_type\n",which_type); ++ ++ /* and use that type to get the conversion function to int4 */ ++ typ = (Form_pg_type) GETSTRUCT(tup); ++ printf("(selfuncs) gethilokey() going to use %d to convert type +%d\n",typ->typinput,which_type); ++ ++ /* and convert the low and high strings */ ++ low_value = (int32) fmgr(typ->typinput, *low, -1); ++ high_value = (int32) fmgr(typ->typinput, *high, -1); ++ printf("(selfuncs) gethilokey() have low %d high +%d\n",low_value,high_value); ++ ++ /* now we have int4's, which we put back into strings because +that's what out */ ++ /* callers (intltsel() at least) expect + - er1p */ ++ pfree(*low); pfree(*high); /* let's not leak the old strings */ ++ *low = int4out(low_value); ++ *high = int4out(high_value); ++ ++ /* XXX - this probably leaks the two tups we got from +SearchSysCacheTuple() - er1p */ ++ } + } + + float64 +SHAR_EOF +fi +if test -f 'vacuum.c.diff' +then + echo shar: "will not over-write existing file 'vacuum.c.diff'" +else +cat << \SHAR_EOF > 'vacuum.c.diff' +*** +/afs/ece.cmu.edu/project/lcs/lcs-004/er1p/postgres/611/src/backend/commands/ +vacuum.c Thu Mar 11 23:59:09 1999 +--- +/afs/ece.cmu.edu/project/lcs/lcs-004/er1p/postgres/615/src/backend/commands/ +vacuum.c Mon Mar 22 21:23:15 1999 +*************** +*** 1842,1848 **** + i = 0; + values[i++] = (Datum) relid; /* 1 */ + values[i++] = (Datum) attp->attnum; /* 2 */ +! values[i++] = (Datum) InvalidOid; /* 3 */ + fmgr_info(stats->outfunc, &out_function); + out_string = (*fmgr_faddr(&out_function)) (stats->min, +stats->attr->atttypid); + values[i++] = (Datum) fmgr(F_TEXTIN, out_string); +--- 1842,1848 ---- + i = 0; + values[i++] = (Datum) relid; /* 1 */ + values[i++] = (Datum) attp->attnum; /* 2 */ +! 
values[i++] = (Datum) stats->f_cmplt.fn_oid; /* 3 */ /* get the +'<' oid, instead of 'invalid' - er1p */ + fmgr_info(stats->outfunc, &out_function); + out_string = (*fmgr_faddr(&out_function)) (stats->min, +stats->attr->atttypid); + values[i++] = (Datum) fmgr(F_TEXTIN, out_string); +SHAR_EOF +fi +exit 0 +# End of shell archive + + + +From owner-pgsql-hackers@hub.org Tue Mar 23 12:31:05 1999 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id MAA17491 + for ; Tue, 23 Mar 1999 12:31:04 -0500 (EST) +Received: from hub.org (majordom@hub.org [209.47.145.100]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id MAA08839 for ; Tue, 23 Mar 1999 12:08:14 -0500 (EST) +Received: from localhost (majordom@localhost) + by hub.org (8.9.2/8.9.1) with SMTP id MAA93649; + Tue, 23 Mar 1999 12:04:57 -0500 (EST) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Tue, 23 Mar 1999 12:03:00 +0000 (EST) +Received: (from majordom@localhost) + by hub.org (8.9.2/8.9.1) id MAA93355 + for pgsql-hackers-outgoing; Tue, 23 Mar 1999 12:02:55 -0500 (EST) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +Received: from sss.sss.pgh.pa.us (sss.pgh.pa.us [206.210.65.6]) + by hub.org (8.9.2/8.9.1) with ESMTP id MAA93336 + for ; Tue, 23 Mar 1999 12:02:43 -0500 (EST) + (envelope-from tgl@sss.pgh.pa.us) +Received: from sss.sss.pgh.pa.us (localhost [127.0.0.1]) + by sss.sss.pgh.pa.us (8.9.1/8.9.1) with ESMTP id MAA24455; + Tue, 23 Mar 1999 12:01:57 -0500 (EST) +To: Erik Riedel +cc: pgsql-hackers@postgreSQL.org +Subject: Re: [HACKERS] optimizer and type question +In-reply-to: Your message of Mon, 22 Mar 1999 23:14:55 -0500 (EST) + <4qxlJ0200anI01hK40@andrew.cmu.edu> +Date: Tue, 23 Mar 1999 12:01:57 -0500 +Message-ID: <24453.922208517@sss.pgh.pa.us> +From: Tom Lane +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +Erik Riedel writes: +> OK, building on your high-level explanation, I am attaching a patch that +> attempts to do something "better" than the current code. Note that I +> have only tested this with the date type and my particular query. + +Glad to see you working on this. I don't like the details of your +patch too much though ;-). Here are some suggestions for making it +better. + +1. I think just removing staop from the lookup in gethilokey is OK for +now, though I'm dubious about Bruce's thought that we could delete that +field entirely. As you observe, vacuum will not currently put more +than one tuple for a column into pg_statistic, so we can just do the +lookup with relid and attno and leave it at that. But I think we ought +to leave the field there, with the idea that vacuum might someday +compute more than one statistic for a data column. Fixing vacuum to +put its sort op into the field is a good idea in the meantime. + +2. The type conversion you're doing in gethilokey is a mess; I think +what you ought to make it do is simply the inbound conversion of the +string from pg_statistic into the internal representation for the +column's datatype, and return that value as a Datum. It also needs +a cleaner success/failure return convention --- this business with +"n" return is ridiculously type-specific. Also, the best and easiest +way to find the type to convert to is to look up the column type in +the info for the given relid, not search pg_proc with the staop value. 
+(I'm not sure that will even work, since there are pg_proc entries +with wildcard argument types.) + +3. The atol() calls currently found in intltsel are a type-specific +cheat on what is conceptually a two-step process: + * Convert the string stored in pg_statistic back to the internal + form for the column data type. + * Generate a numeric representation of the data value that can be + used as an estimate of the range of values in the table. +The second step is trivial for integers, which may obscure the fact +that there are two steps involved, but nonetheless there are. If +you think about applying selectivity logic to strings, say, it +becomes clear that the second step is a necessary component of the +process. Furthermore, the second step must also be applied to the +probe value that's being passed into the selectivity operator. +(The probe value is already in internal form, of course; but it is +not necessarily in a useful numeric form.) + +We can do the first of these steps by applying the appropriate "XXXin" +conversion function for the column data type, as you have done. The +interesting question is how to do the second one. A really clean +solution would require adding a column to pg_type that points to a +function that will do the appropriate conversion. I'd be inclined to +make all of these functions return "double" (float8) and just have one +top-level selectivity routine for all data types that can use +range-based selectivity logic. + +We could probably hack something together that would not use an explicit +conversion function for each data type, but instead would rely on +type-specific assumptions inside the selectivity routines. We'd need many +more selectivity routines though (at least one for each of int, float4, +float8, and text data types) so I'm not sure we'd really save any work +compared to doing it right. + +BTW, now that I look at this issue it's real clear that the selectivity +entries in pg_operator are horribly broken. The intltsel/intgtsel +selectivity routines are currently applied to 32 distinct data types: + +regression=> select distinct typname,oprleft from pg_operator, pg_type +regression-> where pg_type.oid = oprleft +regression-> and oprrest in (103,104); +typname |oprleft +---------+------- +_aclitem | 1034 +abstime | 702 +bool | 16 +box | 603 +bpchar | 1042 +char | 18 +cidr | 650 +circle | 718 +date | 1082 +datetime | 1184 +float4 | 700 +float8 | 701 +inet | 869 +int2 | 21 +int4 | 23 +int8 | 20 +line | 628 +lseg | 601 +macaddr | 829 +money | 790 +name | 19 +numeric | 1700 +oid | 26 +oid8 | 30 +path | 602 +point | 600 +polygon | 604 +text | 25 +time | 1083 +timespan | 1186 +timestamp| 1296 +varchar | 1043 +(32 rows) + +many of which are very obviously not compatible with integer for *any* +purpose. It looks to me like a lot of data types were added to +pg_operator just by copy-and-paste, without paying attention to whether +the selectivity routines were actually correct for the data type. + +As the code stands today, the bogus entries don't matter because +gethilokey always fails, so we always get 1/3 as the selectivity +estimate for any comparison operator (except = and != of course). +I had actually noticed that fact and assumed that it was supposed +to work that way :-(. But, clearly, there is code in here that +is *trying* to be smarter. + +As soon as we fix gethilokey so that it can succeed, we will start +getting essentially-random selectivity estimates for those data types +that aren't actually binary-compatible with integer. 
That will not do; +we have to do something about the issue. + + regards, tom lane + + +From tgl@sss.pgh.pa.us Tue Mar 23 12:31:02 1999 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id MAA17484 + for ; Tue, 23 Mar 1999 12:31:01 -0500 (EST) +Received: from sss.sss.pgh.pa.us (sss.pgh.pa.us [206.210.65.6]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id MAA09042 for ; Tue, 23 Mar 1999 12:10:55 -0500 (EST) +Received: from sss.sss.pgh.pa.us (localhost [127.0.0.1]) + by sss.sss.pgh.pa.us (8.9.1/8.9.1) with ESMTP id MAA24474; + Tue, 23 Mar 1999 12:09:52 -0500 (EST) +To: Bruce Momjian +cc: riedel+@CMU.EDU, pgsql-hackers@postgreSQL.org +Subject: Re: [HACKERS] optimizer and type question +In-reply-to: Your message of Mon, 22 Mar 1999 21:25:45 -0500 (EST) + <199903230225.VAA01641@candle.pha.pa.us> +Date: Tue, 23 Mar 1999 12:09:52 -0500 +Message-ID: <24471.922208992@sss.pgh.pa.us> +From: Tom Lane +Status: RO + +Bruce Momjian writes: +> What we really need is some way to determine how far the requested value +> is from the min/max values. With int, we just do (val-min)/(max-min). +> That works, but how do we do that for types that don't support division. +> Strings come to mind in this case. + +What I'm envisioning is that we still apply the (val-min)/(max-min) +logic, but apply it to numeric values that are produced in a +type-dependent way. + +For ints and floats the conversion is trivial, of course. + +For strings, the first thing that comes to mind is to return 0 for a +null string and the value of the first byte for a non-null string. +This would give you one-part-in-256 selectivity which is plenty good +enough for what the selectivity code needs to do. (Actually, it's +only that good if the strings' first bytes are pretty well spread out. +If you have a table containing English words, for example, you might +only get about one part in 26 this way, since the first bytes will +probably only run from A to Z. Might be better to use the first two +characters of the string to compute the selectivity representation.) + +In general, you can apply this logic as long as you can come up with +some numerical approximation to the data type's sorting order. It +doesn't have to be exact. 
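+
+As a rough, standalone sketch of that kind of type-specific mapping
+(illustrative only -- the function names are made up and this is not
+the backend's actual code), the string case and the generic range
+formula might look like:
+
+    #include <stdio.h>
+
+    /* Map a string onto [0,1) using its first two bytes.  Only an
+     * approximation of the sort order, which is all the estimator
+     * needs. */
+    static double
+    string_to_key(const char *s)
+    {
+        unsigned char b0 = 0, b1 = 0;
+
+        if (s && s[0])
+        {
+            b0 = (unsigned char) s[0];
+            if (s[1])
+                b1 = (unsigned char) s[1];
+        }
+        return (b0 * 256.0 + b1) / (256.0 * 256.0);
+    }
+
+    /* (val - min) / (max - min), clamped; fall back to the old 1/3
+     * default when the high/low keys give us nothing to work with. */
+    static double
+    range_selectivity(double val, double min, double max)
+    {
+        if (max <= min)
+            return 1.0 / 3.0;
+        if (val <= min)
+            return 0.0;
+        if (val >= max)
+            return 1.0;
+        return (val - min) / (max - min);
+    }
+
+    int
+    main(void)
+    {
+        printf("%f\n", range_selectivity(string_to_key("foo"),
+                                         string_to_key("apple"),
+                                         string_to_key("zebra")));
+        return 0;
+    }
+
+The point is just that each type supplies some monotone value-to-double
+conversion; the range formula itself stays type-independent.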
+ + regards, tom lane + +From owner-pgsql-hackers@hub.org Tue Mar 23 12:31:03 1999 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id MAA17488 + for ; Tue, 23 Mar 1999 12:31:02 -0500 (EST) +Received: from hub.org (majordom@hub.org [209.47.145.100]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id MAA09987 for ; Tue, 23 Mar 1999 12:21:34 -0500 (EST) +Received: from localhost (majordom@localhost) + by hub.org (8.9.2/8.9.1) with SMTP id MAA95155; + Tue, 23 Mar 1999 12:18:33 -0500 (EST) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Tue, 23 Mar 1999 12:17:00 +0000 (EST) +Received: (from majordom@localhost) + by hub.org (8.9.2/8.9.1) id MAA94857 + for pgsql-hackers-outgoing; Tue, 23 Mar 1999 12:16:56 -0500 (EST) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +Received: from sss.sss.pgh.pa.us (sss.pgh.pa.us [206.210.65.6]) + by hub.org (8.9.2/8.9.1) with ESMTP id MAA94469 + for ; Tue, 23 Mar 1999 12:11:33 -0500 (EST) + (envelope-from tgl@sss.pgh.pa.us) +Received: from sss.sss.pgh.pa.us (localhost [127.0.0.1]) + by sss.sss.pgh.pa.us (8.9.1/8.9.1) with ESMTP id MAA24474; + Tue, 23 Mar 1999 12:09:52 -0500 (EST) +To: Bruce Momjian +cc: riedel+@CMU.EDU, pgsql-hackers@postgreSQL.org +Subject: Re: [HACKERS] optimizer and type question +In-reply-to: Your message of Mon, 22 Mar 1999 21:25:45 -0500 (EST) + <199903230225.VAA01641@candle.pha.pa.us> +Date: Tue, 23 Mar 1999 12:09:52 -0500 +Message-ID: <24471.922208992@sss.pgh.pa.us> +From: Tom Lane +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +Bruce Momjian writes: +> What we really need is some way to determine how far the requested value +> is from the min/max values. With int, we just do (val-min)/(max-min). +> That works, but how do we do that for types that don't support division. +> Strings come to mind in this case. + +What I'm envisioning is that we still apply the (val-min)/(max-min) +logic, but apply it to numeric values that are produced in a +type-dependent way. + +For ints and floats the conversion is trivial, of course. + +For strings, the first thing that comes to mind is to return 0 for a +null string and the value of the first byte for a non-null string. +This would give you one-part-in-256 selectivity which is plenty good +enough for what the selectivity code needs to do. (Actually, it's +only that good if the strings' first bytes are pretty well spread out. +If you have a table containing English words, for example, you might +only get about one part in 26 this way, since the first bytes will +probably only run from A to Z. Might be better to use the first two +characters of the string to compute the selectivity representation.) + +In general, you can apply this logic as long as you can come up with +some numerical approximation to the data type's sorting order. It +doesn't have to be exact. 
+ + regards, tom lane + + diff --git a/doc/TODO.detail/outer b/doc/TODO.detail/outer new file mode 100644 index 0000000000..f0ab8577da --- /dev/null +++ b/doc/TODO.detail/outer @@ -0,0 +1,313 @@ +From lockhart@alumni.caltech.edu Thu Jan 7 13:31:08 1999 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id NAA07771 + for ; Thu, 7 Jan 1999 13:31:06 -0500 (EST) +Received: from golem.jpl.nasa.gov (IDENT:root@hectic-2.jpl.nasa.gov [128.149.68.204]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id NAA14597 for ; Thu, 7 Jan 1999 13:27:37 -0500 (EST) +Received: from alumni.caltech.edu (localhost [127.0.0.1]) + by golem.jpl.nasa.gov (8.8.5/8.8.5) with ESMTP id SAA13416; + Thu, 7 Jan 1999 18:26:56 GMT +Sender: tgl@mythos.jpl.nasa.gov +Message-ID: <3694FC70.FAD67BC3@alumni.caltech.edu> +Date: Thu, 07 Jan 1999 18:26:56 +0000 +From: "Thomas G. Lockhart" +Organization: Caltech/JPL +X-Mailer: Mozilla 4.07 [en] (X11; I; Linux 2.0.30 i686) +MIME-Version: 1.0 +To: Bruce Momjian +CC: Postgres Hackers List +Subject: Outer Joins (and need CASE help) +References: <199901071747.MAA07054@candle.pha.pa.us> +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +Status: RO + +> Thomas, do you need help on outer joins? + +Yes. I'm going slowly partly because I get distracted with other +Postgres stuff like docs, and partly because I don't understand all of +the pieces I'm working with. + +I've identified the place in the MergeJoin code where the null filling +for outer joins needs to happen, and have the "merge walk" code done. +But I don't have the supporting code which actually would know how to +null-fill a result tuple from the left or right. I thought you might be +interested in that? + +I've done some work in the parser, and can now do things like: + +postgres=> select * from t1 join t2 using (i); +NOTICE: JOIN not yet implemented +i|j|i|k +-+-+-+- +1|2|1|3 +(1 row) + +But this is just an inner join, and the result isn't quite right since +the second "i" column should probably be omitted. At the moment I +transform it from the syntax above into existing parse nodes, and +everything from there on works. + +I don't yet pass an explicit join node into the planner/optimizer, and +that will be the hardest part I assume. Perhaps we can work on that +together. + +So, what I'll try to do (soon, in the next few days?) is put in + + #ifdef ENABLE_OUTER_JOINS + +conditional code into the parser area (already there for the executor) +and commit everything to the development tree. Does that sound OK? + +Oh, and if anyone is looking for something to do, I've got a couple of +CASE statements in the case.sql regression test which are commented out +because they crash the backend. They involve references to multiple +tables within a single result column, and in other contexts that +construct works. It would be great if someone had time to track it +down... 
+ + - Tom + +From lockhart@alumni.caltech.edu Mon Feb 22 02:01:13 1999 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id CAA22073 + for ; Mon, 22 Feb 1999 02:01:12 -0500 (EST) +Received: from golem.jpl.nasa.gov (IDENT:root@hectic-2.jpl.nasa.gov [128.149.68.204]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id BAA26054 for ; Mon, 22 Feb 1999 01:57:00 -0500 (EST) +Received: from alumni.caltech.edu (localhost [127.0.0.1]) + by golem.jpl.nasa.gov (8.8.5/8.8.5) with ESMTP id GAA04715; + Mon, 22 Feb 1999 06:56:36 GMT +Sender: tgl@mythos.jpl.nasa.gov +Message-ID: <36D0FFA4.32ADB75C@alumni.caltech.edu> +Date: Mon, 22 Feb 1999 06:56:36 +0000 +From: "Thomas G. Lockhart" +Organization: Caltech/JPL +X-Mailer: Mozilla 4.07 [en] (X11; I; Linux 2.0.36 i686) +MIME-Version: 1.0 +To: Bruce Momjian +CC: hackers@postgreSQL.org +Subject: Re: start on outer join +References: <199902220304.WAA10066@candle.pha.pa.us> +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +Status: ROr + +Bruce Momjian wrote: +> +> > Will apply ... some other changes laying a bit of +> > groundwork for outer joins so you can start on the planner/optimizer +> > parts :) +> Those will be a synch now that I understand the optimizer. In fact, I +> think it all will happen in the executor. + +I've modified executor/nodeMergeJoin.c to walk a left/right/both outer +join, but didn't fill in the part which actually creates the result +tuple (which will be the current left- or right-side tuple plus nulls +for filler). I hope this is up your alley :) + +So far, I'm not certain what to pass to the planner. The syntax leads me +to pass a select structure from gram.y with a "JoinExpr" structure in +the "fromClause" list. I need to expand that with a combination of +column names and qualifications, but at the time I see the JoinExpr I +don't have access to the top query structure itself. So I may just keep +a modestly transformed JoinExpr to expand later or to pass to the +planner. + +btw, the EXCEPT/INTERSECT stuff from Stefan has some ugliness in gram.y +which needs to be fixed (the shift/reduce conflict is not acceptable for +our release version) and some of that code clearly needs to move to +analyze.c or some other module. + + - Tom + +From maillist Wed Feb 24 05:27:08 1999 +Received: (from maillist@localhost) + by candle.pha.pa.us (8.9.0/8.9.0) id FAA09648; + Wed, 24 Feb 1999 05:27:08 -0500 (EST) +From: Bruce Momjian +Message-Id: <199902241027.FAA09648@candle.pha.pa.us> +Subject: Re: [HACKERS] OUTER joins +In-Reply-To: <199902240953.EAA08561@candle.pha.pa.us> from Bruce Momjian at "Feb 24, 1999 4:53:21 am" +To: maillist@candle.pha.pa.us (Bruce Momjian) +Date: Wed, 24 Feb 1999 05:27:07 -0500 (EST) +Cc: lockhart@alumni.caltech.edu, hackers@postgreSQL.org +X-Mailer: ELM [version 2.4ME+ PL47 (25)] +MIME-Version: 1.0 +Content-Type: text/plain; charset=US-ASCII +Content-Transfer-Encoding: 7bit +Status: RO + +> +> How do you propose doing outer joins in non-mergejoin situations? +> Mergejoins can only be used currently in equal joins. + +Is your solution going to be to make sure the OUTER table is always a +MergeJoin, or on the outside of a join loop? That could work. + +That could get tricky if the table is joined to _two_ other tables. +With the cleaned-up optimizer, we can disable non-merge joins in certain +circumstances, and prevent OUTER tables from being inner in the others. +Is that the plan? 
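+
+One way to picture the null-filling being asked for here is a toy,
+self-contained nested loop -- the Row struct and function name are
+made up, and this is not the executor's real code:
+
+    #include <stdio.h>
+
+    typedef struct { int key; int payload; } Row;
+
+    /* LEFT OUTER nested loop: every outer row is emitted; when no
+     * inner row matches, it is emitted once, padded with NULL. */
+    static void
+    nestloop_left_outer(const Row *outer, int nouter,
+                        const Row *inner, int ninner)
+    {
+        int i, j, matched;
+
+        for (i = 0; i < nouter; i++)
+        {
+            matched = 0;
+            for (j = 0; j < ninner; j++)
+            {
+                if (outer[i].key == inner[j].key)
+                {
+                    matched = 1;
+                    printf("%d|%d|%d\n", outer[i].key,
+                           outer[i].payload, inner[j].payload);
+                }
+            }
+            if (!matched)
+                printf("%d|%d|NULL\n", outer[i].key, outer[i].payload);
+        }
+    }
+
+    int
+    main(void)
+    {
+        Row t1[] = {{1, 11}, {2, 12}, {3, 13}, {4, 14}};
+        Row t2[] = {{1, 21}, {3, 23}};
+
+        nestloop_left_outer(t1, 4, t2, 2);
+        return 0;
+    }
+
+Whatever join method ends up hosting the outer table, that is the extra
+step it has to support: notice that the inner side produced no match
+and emit the padded row anyway.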
+ +-- + Bruce Momjian | http://www.op.net/~candle + maillist@candle.pha.pa.us | (610) 853-3000 + + If your life is a hard drive, | 830 Blythe Avenue + + Christ can be your backup. | Drexel Hill, Pennsylvania 19026 + +From lockhart@alumni.caltech.edu Mon Mar 1 13:01:08 1999 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id NAA21672 + for ; Mon, 1 Mar 1999 13:01:06 -0500 (EST) +Received: from golem.jpl.nasa.gov (IDENT:root@hectic-2.jpl.nasa.gov [128.149.68.204]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id MAA12756 for ; Mon, 1 Mar 1999 12:14:16 -0500 (EST) +Received: from alumni.caltech.edu (localhost [127.0.0.1]) + by golem.jpl.nasa.gov (8.8.5/8.8.5) with ESMTP id RAA09406; + Mon, 1 Mar 1999 17:10:49 GMT +Sender: tgl@mythos.jpl.nasa.gov +Message-ID: <36DACA19.E6DBE7D8@alumni.caltech.edu> +Date: Mon, 01 Mar 1999 17:10:49 +0000 +From: "Thomas G. Lockhart" +Organization: Caltech/JPL +X-Mailer: Mozilla 4.07 [en] (X11; I; Linux 2.0.36 i686) +MIME-Version: 1.0 +To: Bruce Momjian +CC: PostgreSQL-development +Subject: Re: OUTER joins +References: <199902240953.EAA08561@candle.pha.pa.us> +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +Status: ROr + +(back from a short vacation...) + +> How do you propose doing outer joins in non-mergejoin situations? +> Mergejoins can only be used currently in equal joins. + +Hadn't thought about it, other than figuring that implementing the +equi-join first was a good start. There is a class of outer join syntax +(the USING clause) which is implicitly an equi-join... + + - Tom + +From lockhart@alumni.caltech.edu Mon Mar 8 21:55:02 1999 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id VAA15978 + for ; Mon, 8 Mar 1999 21:54:57 -0500 (EST) +Received: from golem.jpl.nasa.gov (IDENT:root@hectic-1.jpl.nasa.gov [128.149.68.203]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id VAA15837 for ; Mon, 8 Mar 1999 21:48:33 -0500 (EST) +Received: from alumni.caltech.edu (localhost [127.0.0.1]) + by golem.jpl.nasa.gov (8.8.5/8.8.5) with ESMTP id CAA06996; + Tue, 9 Mar 1999 02:46:40 GMT +Sender: tgl@mythos.jpl.nasa.gov +Message-ID: <36E48B90.F3E902B7@alumni.caltech.edu> +Date: Tue, 09 Mar 1999 02:46:40 +0000 +From: "Thomas G. Lockhart" +Organization: Caltech/JPL +X-Mailer: Mozilla 4.07 [en] (X11; I; Linux 2.0.36 i686) +MIME-Version: 1.0 +To: Bruce Momjian +CC: hackers@postgreSQL.org +Subject: Re: OUTER joins +References: <199903070325.WAA10357@candle.pha.pa.us> +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +Status: ROr + +> > Hadn't thought about it, other than figuring that implementing the +> > equi-join first was a good start. There is a class of outer join +> > syntax (the USING clause) which is implicitly an equi-join... +> Not that easy. You don't automatically get a mergejoin from an +> equijoin. I will have to force outer's to be either mergejoins, or +> inners of non-merge joins. Can you add code to non-merge joins in the +> executor to throw out a null row if it does not find an inner match +> for the outer row, and I will handle the optimizer so it doesn't throw +> a non-conforming plan to the executor. + +So far I don't have enough info in the parser to get the +planner/optimizer going. Should we work from the front to the back, or +should I go ahead and look at the non-merge joins? 
It's painfully +obvious that I don't know anything about the middle parts of this to +proceed without lots more research. + + - Tom + +From lockhart@alumni.caltech.edu Tue Mar 9 22:47:57 1999 +Received: from golem.jpl.nasa.gov (IDENT:root@hectic-1.jpl.nasa.gov [128.149.68.203]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id WAA07869 + for ; Tue, 9 Mar 1999 22:47:54 -0500 (EST) +Received: from alumni.caltech.edu (localhost [127.0.0.1]) + by golem.jpl.nasa.gov (8.8.5/8.8.5) with ESMTP id DAA14761; + Wed, 10 Mar 1999 03:46:43 GMT +Sender: tgl@mythos.jpl.nasa.gov +Message-ID: <36E5EB23.F5CD959B@alumni.caltech.edu> +Date: Wed, 10 Mar 1999 03:46:43 +0000 +From: "Thomas G. Lockhart" +Organization: Caltech/JPL +X-Mailer: Mozilla 4.07 [en] (X11; I; Linux 2.0.36 i686) +MIME-Version: 1.0 +To: Bruce Momjian , tgl@mythos.jpl.nasa.gov +Subject: Re: SQL outer +References: <199903100112.UAA05772@candle.pha.pa.us> +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +Status: RO + +> select * +> from outer tab1, tab2, tab3 +> where tab1.col1 = tab2.col1 and +> tab1.col1 = tab3.col1 + +select * +from t1 left join t2 using (c1) + join t3 on (c1 = t3.c1) + +Result: +t1.c1 t1.c2 t2.c2 t3.c1 +2 12 NULL 32 + +t1: +c1 c2 +1 11 +2 12 +3 13 +4 14 + +t2: +c1 c2 +1 21 +3 23 + +t3: +c1 c2 +2 32 + +From lockhart@alumni.caltech.edu Wed Mar 10 10:48:54 1999 +Received: from golem.jpl.nasa.gov (IDENT:root@hectic-1.jpl.nasa.gov [128.149.68.203]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id KAA16741 + for ; Wed, 10 Mar 1999 10:48:51 -0500 (EST) +Received: from alumni.caltech.edu (localhost [127.0.0.1]) + by golem.jpl.nasa.gov (8.8.5/8.8.5) with ESMTP id PAA17723; + Wed, 10 Mar 1999 15:48:31 GMT +Sender: tgl@mythos.jpl.nasa.gov +Message-ID: <36E6944F.1F93B08@alumni.caltech.edu> +Date: Wed, 10 Mar 1999 15:48:31 +0000 +From: "Thomas G. Lockhart" +Organization: Caltech/JPL +X-Mailer: Mozilla 4.07 [en] (X11; I; Linux 2.0.36 i686) +MIME-Version: 1.0 +To: Bruce Momjian +CC: Thomas Lockhart +Subject: Re: SQL outer +References: <199903100112.UAA05772@candle.pha.pa.us> <36E5EB23.F5CD959B@alumni.caltech.edu> +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +Status: ROr + +Just thinking... + +If the initial RelOptInfo groupings are derived from the WHERE clause +expressions, how about marking the "outer" property in those expressions +in the parser? istm that is where the parser knows about two tables in +one place, and I'm generating those expressions anyway. We could add a +field(s) to the expression structure, or pass along a slightly different +structure... 
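+
+For what it's worth, the marking could be as small as this -- a purely
+hypothetical shape (OuterSide/MarkedQual are made-up names, not the
+actual parse-node definitions):
+
+    /* Tag each qualification with whether it came from an outer-join
+     * construct and which side may need null-filling. */
+    typedef enum OuterSide
+    {
+        OUTER_NONE,             /* ordinary inner-join qual */
+        OUTER_LEFT,
+        OUTER_RIGHT,
+        OUTER_FULL
+    } OuterSide;
+
+    typedef struct MarkedQual
+    {
+        void       *qual;       /* the expression node itself */
+        OuterSide   outerside;  /* set in the parser, read later */
+    } MarkedQual;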
+ + - Tom + diff --git a/doc/TODO.detail/performance b/doc/TODO.detail/performance new file mode 100644 index 0000000000..abe1f67619 --- /dev/null +++ b/doc/TODO.detail/performance @@ -0,0 +1,343 @@ +From owner-pgsql-hackers@hub.org Sun Jun 14 18:45:04 1998 +Received: from hub.org (hub.org [209.47.148.200]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id SAA03690 + for ; Sun, 14 Jun 1998 18:45:00 -0400 (EDT) +Received: from localhost (majordom@localhost) by hub.org (8.8.8/8.7.5) with SMTP id SAA28049; Sun, 14 Jun 1998 18:39:42 -0400 (EDT) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Sun, 14 Jun 1998 18:36:06 +0000 (EDT) +Received: (from majordom@localhost) by hub.org (8.8.8/8.7.5) id SAA27943 for pgsql-hackers-outgoing; Sun, 14 Jun 1998 18:36:04 -0400 (EDT) +Received: from angular.illustra.com (ifmxoak.illustra.com [206.175.10.34]) by hub.org (8.8.8/8.7.5) with ESMTP id SAA27925 for ; Sun, 14 Jun 1998 18:35:47 -0400 (EDT) +Received: from hawk.illustra.com (hawk.illustra.com [158.58.61.70]) by angular.illustra.com (8.7.4/8.7.3) with SMTP id PAA21293 for ; Sun, 14 Jun 1998 15:35:12 -0700 (PDT) +Received: by hawk.illustra.com (5.x/smail2.5/06-10-94/S) + id AA07922; Sun, 14 Jun 1998 15:35:13 -0700 +From: dg@illustra.com (David Gould) +Message-Id: <9806142235.AA07922@hawk.illustra.com> +Subject: [HACKERS] performance tests, initial results +To: pgsql-hackers@postgreSQL.org +Date: Sun, 14 Jun 1998 15:35:13 -0700 (PDT) +Mime-Version: 1.0 +Content-Type: text/plain; charset=US-ASCII +Content-Transfer-Encoding: 7bit +Sender: owner-pgsql-hackers@hub.org +Precedence: bulk +Status: RO + + +I have been playing a little with the performance tests found in +pgsql/src/tests/performance and have a few observations that might be of +minor interest. + +The tests themselves are simple enough although the result parsing in the +driver did not work on Linux. I am enclosing a patch below to fix this. I +think it will also work better on the other systems. + +A summary of results from my testing are below. Details are at the bottom +of this message. + +My test system is 'leslie': + + linux 2.0.32, gcc version 2.7.2.3 + P133, HX chipset, 512K L2, 32MB mem + NCR810 fast scsi, Quantum Atlas 2GB drive (7200 rpm). + + + Results Summary (times in seconds) + + Single txn 8K txn Create 8K idx 8K random Simple +Case Description 8K insert 8K insert Index Insert Scans Orderby +=================== ========== ========= ====== ====== ========= ======= +1 From Distribution + P90 FreeBsd -B256 39.56 1190.98 3.69 46.65 65.49 2.27 + IDE + +2 Running on leslie + P133 Linux 2.0.32 15.48 326.75 2.99 20.69 35.81 1.68 + SCSI 32M + +3 leslie, -o -F + no forced writes 15.90 24.98 2.63 20.46 36.43 1.69 + +4 leslie, -o -F + no ASSERTS 14.92 23.23 1.38 18.67 33.79 1.58 + +5 leslie, -o -F -B2048 + more buffers 21.31 42.28 2.65 25.74 42.26 1.72 + +6 leslie, -o -F -B2048 + more bufs, no ASSERT 20.52 39.79 1.40 24.77 39.51 1.55 + + + + + Case to Case Difference Factors (+ is faster) + + Single txn 8K txn Create 8K idx 8K random Simple +Case Description 8K insert 8K insert Index Insert Scans Orderby +=================== ========== ========= ====== ====== ========= ======= + +leslie vs BSD P90. 
2.56 3.65 1.23 2.25 1.83 1.35 + +(noflush -F) vs no -F -1.03 13.08 1.14 1.01 -1.02 1.00 + +No Assert vs Assert 1.05 1.07 1.90 1.06 1.07 1.09 + +-B256 vs -B2048 1.34 1.69 1.01 1.26 1.16 1.02 + + +Observations: + + - leslie (P133 linux) appears to be about 1.8 times faster than the + P90 BSD system used for the test result distributed with the source, not + counting the 8K txn insert case which was completely disk bound. + + - SCSI disks make a big (factor of 3.6) difference. During this test the + disk was hammering and cpu utilization was < 10%. + + - Assertion checking seems to cost about 7% except for create index where + it costs 90% + + - the -F option to avoid flushing buffers has tremendous effect if there are + many very small transactions. Or, another way, flushing at the end of the + transaction is a major disaster for performance. + + - Something is very wrong with our buffer cache implementation. Going from + 256 buffers to 2048 buffers costs an average of 25%. In the 8K txn case + it costs about 70%. I see looking at the code and profiling that in the 8K + txn case this is in BufferSync() which examines all the buffers at commit + time. I don't quite understand why it is so costly for the single 8K row + txn (35%) though. + +It would be nice to have some more tests. Maybe the Wisconsin stuff will +be useful. + + + +----------------- patch to test harness. apply from pgsql ------------ +*** src/test/performance/runtests.pl.orig Sun Jun 14 11:34:04 1998 + +Differences % + + +----------------- patch to test harness. apply from pgsql ------------ +*** src/test/performance/runtests.pl.orig Sun Jun 14 11:34:04 1998 +--- src/test/performance/runtests.pl Sun Jun 14 12:07:30 1998 +*************** +*** 84,123 **** + open (STDERR, ">$TmpFile") or die; + select (STDERR); $| = 1; + +! for ($i = 0; $i <= $#perftests; $i++) +! { + $test = $perftests[$i]; + ($test, $XACTBLOCK) = split (/ /, $test); + $runtest = $test; +! if ( $test =~ /\.ntm/ ) +! { +! # + # No timing for this queries +- # + close (STDERR); # close $TmpFile + open (STDERR, ">/dev/null") or die; + $runtest =~ s/\.ntm//; + } +! else +! { + close (STDOUT); + open(STDOUT, ">&SAVEOUT"); + print STDOUT "\nRunning: $perftests[$i+1] ..."; + close (STDOUT); + open (STDOUT, ">/dev/null") or die; + select (STDERR); $| = 1; +! printf "$perftests[$i+1]: "; + } + + do "sqls/$runtest"; + + # Restore STDERR to $TmpFile +! if ( $test =~ /\.ntm/ ) +! { + close (STDERR); + open (STDERR, ">>$TmpFile") or die; + } +- + select (STDERR); $| = 1; + $i++; + } +--- 84,116 ---- + open (STDERR, ">$TmpFile") or die; + select (STDERR); $| = 1; + +! for ($i = 0; $i <= $#perftests; $i++) { + $test = $perftests[$i]; + ($test, $XACTBLOCK) = split (/ /, $test); + $runtest = $test; +! if ( $test =~ /\.ntm/ ) { + # No timing for this queries + close (STDERR); # close $TmpFile + open (STDERR, ">/dev/null") or die; + $runtest =~ s/\.ntm//; + } +! else { + close (STDOUT); + open(STDOUT, ">&SAVEOUT"); + print STDOUT "\nRunning: $perftests[$i+1] ..."; + close (STDOUT); + open (STDOUT, ">/dev/null") or die; + select (STDERR); $| = 1; +! print "$perftests[$i+1]: "; + } + + do "sqls/$runtest"; + + # Restore STDERR to $TmpFile +! if ( $test =~ /\.ntm/ ) { + close (STDERR); + open (STDERR, ">>$TmpFile") or die; + } + select (STDERR); $| = 1; + $i++; + } +*************** +*** 128,138 **** + open (TMPF, "<$TmpFile") or die; + open (RESF, ">$ResFile") or die; + +! while () +! { +! $str = $_; +! ($test, $rtime) = split (/:/, $str); +! 
($tmp, $rtime, $rest) = split (/[ ]+/, $rtime); +! print RESF "$test: $rtime\n"; + } + +--- 121,130 ---- + open (TMPF, "<$TmpFile") or die; + open (RESF, ">$ResFile") or die; + +! while () { +! if (m/^(.*: ).* ([0-9:.]+) *elapsed/) { +! ($test, $rtime) = ($1, $2); +! print RESF $test, $rtime, "\n"; +! } + } + +------------------------------------------------------------------------ + + +------------------------- testcase detail -------------------------- + +1. from distribution + DBMS: PostgreSQL 6.2b10 + OS: FreeBSD 2.1.5-RELEASE + HardWare: i586/90, 24M RAM, IDE + StartUp: postmaster -B 256 '-o -S 2048' -S + Compiler: gcc 2.6.3 + Compiled: -O, without CASSERT checking, with + -DTBL_FREE_CMD_MEMORY (to free memory + if BEGIN/END after each query execution) + DB connection startup: 0.20 + 8192 INSERTs INTO SIMPLE (1 xact): 39.58 + 8192 INSERTs INTO SIMPLE (8192 xacts): 1190.98 + Create INDEX on SIMPLE: 3.69 + 8192 INSERTs INTO SIMPLE with INDEX (1 xact): 46.65 + 8192 random INDEX scans on SIMPLE (1 xact): 65.49 + ORDER BY SIMPLE: 2.27 + + +2. run on leslie with asserts + DBMS: PostgreSQL 6.3.2 (plus changes to 98/06/01) + OS: Linux 2.0.32 leslie + HardWare: i586/133 HX 512, 32M RAM, fast SCSI, 7200rpm + StartUp: postmaster -B 256 '-o -S 2048' -S + Compiler: gcc 2.7.2.3 + Compiled: -O, WITH CASSERT checking, with + -DTBL_FREE_CMD_MEMORY (to free memory + if BEGIN/END after each query execution) + DB connection startup: 0.10 + 8192 INSERTs INTO SIMPLE (1 xact): 15.48 + 8192 INSERTs INTO SIMPLE (8192 xacts): 326.75 + Create INDEX on SIMPLE: 2.99 + 8192 INSERTs INTO SIMPLE with INDEX (1 xact): 20.69 + 8192 random INDEX scans on SIMPLE (1 xact): 35.81 + ORDER BY SIMPLE: 1.68 + + +3. with -F to avoid forced i/o + DBMS: PostgreSQL 6.3.2 (plus changes to 98/06/01) + OS: Linux 2.0.32 leslie + HardWare: i586/133 HX 512, 32M RAM, fast SCSI, 7200rpm + StartUp: postmaster -B 256 '-o -S 2048 -F' -S + Compiler: gcc 2.7.2.3 + Compiled: -O, WITH CASSERT checking, with + -DTBL_FREE_CMD_MEMORY (to free memory + if BEGIN/END after each query execution) + DB connection startup: 0.10 + 8192 INSERTs INTO SIMPLE (1 xact): 15.90 + 8192 INSERTs INTO SIMPLE (8192 xacts): 24.98 + Create INDEX on SIMPLE: 2.63 + 8192 INSERTs INTO SIMPLE with INDEX (1 xact): 20.46 + 8192 random INDEX scans on SIMPLE (1 xact): 36.43 + ORDER BY SIMPLE: 1.69 + + +4. no asserts, -F to avoid forced I/O + DBMS: PostgreSQL 6.3.2 (plus changes to 98/06/01) + OS: Linux 2.0.32 leslie + HardWare: i586/133 HX 512, 32M RAM, fast SCSI, 7200rpm + StartUp: postmaster -B 256 '-o -S 2048' -S + Compiler: gcc 2.7.2.3 + Compiled: -O, No CASSERT checking, with + -DTBL_FREE_CMD_MEMORY (to free memory + if BEGIN/END after each query execution) + DB connection startup: 0.10 + 8192 INSERTs INTO SIMPLE (1 xact): 14.92 + 8192 INSERTs INTO SIMPLE (8192 xacts): 23.23 + Create INDEX on SIMPLE: 1.38 + 8192 INSERTs INTO SIMPLE with INDEX (1 xact): 18.67 + 8192 random INDEX scans on SIMPLE (1 xact): 33.79 + ORDER BY SIMPLE: 1.58 + + +5. 
with more buffers (2048 vs 256) and -F to avoid forced i/o + DBMS: PostgreSQL 6.3.2 (plus changes to 98/06/01) + OS: Linux 2.0.32 leslie + HardWare: i586/133 HX 512, 32M RAM, fast SCSI, 7200rpm + StartUp: postmaster -B 2048 '-o -S 2048 -F' -S + Compiler: gcc 2.7.2.3 + Compiled: -O, WITH CASSERT checking, with + -DTBL_FREE_CMD_MEMORY (to free memory + if BEGIN/END after each query execution) + DB connection startup: 0.11 + 8192 INSERTs INTO SIMPLE (1 xact): 21.31 + 8192 INSERTs INTO SIMPLE (8192 xacts): 42.28 + Create INDEX on SIMPLE: 2.65 + 8192 INSERTs INTO SIMPLE with INDEX (1 xact): 25.74 + 8192 random INDEX scans on SIMPLE (1 xact): 42.26 + ORDER BY SIMPLE: 1.72 + + +6. No Asserts, more buffers (2048 vs 256) and -F to avoid forced i/o + DBMS: PostgreSQL 6.3.2 (plus changes to 98/06/01) + OS: Linux 2.0.32 leslie + HardWare: i586/133 HX 512, 32M RAM, fast SCSI, 7200rpm + StartUp: postmaster -B 2048 '-o -S 2048 -F' -S + Compiler: gcc 2.7.2.3 + Compiled: -O, No CASSERT checking, with + -DTBL_FREE_CMD_MEMORY (to free memory + if BEGIN/END after each query execution) + DB connection startup: 0.11 + 8192 INSERTs INTO SIMPLE (1 xact): 20.52 + 8192 INSERTs INTO SIMPLE (8192 xacts): 39.79 + Create INDEX on SIMPLE: 1.40 + 8192 INSERTs INTO SIMPLE with INDEX (1 xact): 24.77 + 8192 random INDEX scans on SIMPLE (1 xact): 39.51 + ORDER BY SIMPLE: 1.55 +--------------------------------------------------------------------- + +-dg + +David Gould dg@illustra.com 510.628.3783 or 510.305.9468 +Informix Software (No, really) 300 Lakeside Drive Oakland, CA 94612 +"Don't worry about people stealing your ideas. If your ideas are any + good, you'll have to ram them down people's throats." -- Howard Aiken + + diff --git a/doc/TODO.detail/persistent b/doc/TODO.detail/persistent new file mode 100644 index 0000000000..6e69485fa4 --- /dev/null +++ b/doc/TODO.detail/persistent @@ -0,0 +1,102 @@ +From owner-pgsql-hackers@hub.org Mon May 11 11:31:09 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id LAA03006 + for ; Mon, 11 May 1998 11:31:07 -0400 (EDT) +Received: from hub.org (hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id LAA01663 for ; Mon, 11 May 1998 11:24:42 -0400 (EDT) +Received: from localhost (majordom@localhost) by hub.org (8.8.8/8.7.5) with SMTP id LAA21841; Mon, 11 May 1998 11:15:25 -0400 (EDT) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Mon, 11 May 1998 11:15:12 +0000 (EDT) +Received: (from majordom@localhost) by hub.org (8.8.8/8.7.5) id LAA21683 for pgsql-hackers-outgoing; Mon, 11 May 1998 11:15:09 -0400 (EDT) +Received: from sss.sss.pgh.pa.us (sss.pgh.pa.us [206.210.65.6]) by hub.org (8.8.8/8.7.5) with ESMTP id LAA21451 for ; Mon, 11 May 1998 11:15:03 -0400 (EDT) +Received: from sss.sss.pgh.pa.us (localhost [127.0.0.1]) + by sss.sss.pgh.pa.us (8.8.5/8.8.5) with ESMTP id LAA24915; + Mon, 11 May 1998 11:14:43 -0400 (EDT) +To: Brett McCormick +cc: hackers@postgreSQL.org +Subject: Re: [HACKERS] Re: [PATCHES] Try again: S_LOCK reduced contentionh] +In-reply-to: Your message of Mon, 11 May 1998 07:57:23 -0700 (PDT) + <13655.4384.345723.466046@abraxas.scene.com> +Date: Mon, 11 May 1998 11:14:43 -0400 +Message-ID: <24913.894899683@sss.pgh.pa.us> +From: Tom Lane +Sender: owner-pgsql-hackers@hub.org +Precedence: bulk +Status: RO + +Brett McCormick writes: +> same way that the current network socket is passed -- through an execv +> argument. 
hopefully, however, the non-execv()ing fork will be in 6.4. + +Um, you missed the point, Brett. David was hoping to transfer a client +connection from the postmaster to an *already existing* backend process. +Fork, with or without exec, solves the problem for a backend that's +started after the postmaster has accepted the client socket. + +This does lead to a different line of thought, however. Pre-started +backends would have access to the "master" connection socket on which +the postmaster listens for client connections, right? Suppose that we +fire the postmaster as postmaster, and demote it to being simply a +manufacturer of new backend processes as old ones get used up. Have +one of the idle backend processes be the one doing the accept() on the +master socket. Once it has a client connection, it performs the +authentication handshake and then starts serving the client (or just +quits if authentication fails). Meanwhile the next idle backend process +has executed accept() on the master socket and is waiting for the next +client; and shortly the postmaster/factory/whateverwecallitnow notices +that it needs to start another backend to add to the idle-backend pool. + +This'd probably need some interlocking among the backends. I have no +idea whether it'd be safe to have all the idle backends trying to +do accept() on the master socket simultaneously, but it sounds risky. +Better to use a mutex so that only one gets to do it while the others +sleep. + + regards, tom lane + + +From owner-pgsql-hackers@hub.org Mon May 11 11:35:55 1998 +Received: from hub.org (hub.org [209.47.148.200]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id LAA03043 + for ; Mon, 11 May 1998 11:35:53 -0400 (EDT) +Received: from localhost (majordom@localhost) by hub.org (8.8.8/8.7.5) with SMTP id LAA23494; Mon, 11 May 1998 11:27:10 -0400 (EDT) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Mon, 11 May 1998 11:27:02 +0000 (EDT) +Received: (from majordom@localhost) by hub.org (8.8.8/8.7.5) id LAA23473 for pgsql-hackers-outgoing; Mon, 11 May 1998 11:27:01 -0400 (EDT) +Received: from sss.sss.pgh.pa.us (sss.pgh.pa.us [206.210.65.6]) by hub.org (8.8.8/8.7.5) with ESMTP id LAA23462 for ; Mon, 11 May 1998 11:26:56 -0400 (EDT) +Received: from sss.sss.pgh.pa.us (localhost [127.0.0.1]) + by sss.sss.pgh.pa.us (8.8.5/8.8.5) with ESMTP id LAA25006; + Mon, 11 May 1998 11:26:44 -0400 (EDT) +To: Brett McCormick +cc: hackers@postgreSQL.org +Subject: Re: [HACKERS] Re: [PATCHES] Try again: S_LOCK reduced contentionh] +In-reply-to: Your message of Mon, 11 May 1998 07:57:23 -0700 (PDT) + <13655.4384.345723.466046@abraxas.scene.com> +Date: Mon, 11 May 1998 11:26:44 -0400 +Message-ID: <25004.894900404@sss.pgh.pa.us> +From: Tom Lane +Sender: owner-pgsql-hackers@hub.org +Precedence: bulk +Status: RO + +Meanwhile, *I* missed the point about Brett's second comment :-( + +Brett McCormick writes: +> There will have to be some sort of arg parsing in any case, +> considering that you can pass configurable arguments to the backend.. + +If we do the sort of change David and I were just discussing, then the +pre-spawned backend would become responsible for parsing and dealing +with the PGOPTIONS portion of the client's connection request message. +That's just part of shifting the authentication handshake code from +postmaster to backend, so it shouldn't be too hard. + +BUT: the whole point is to be able to initialize the backend before it +is connected to a client. 
How much of the expensive backend startup +work depends on having the client connection options available? +Any work that needs to know the options will have to wait until after +the client connects. If that means most of the startup work can't +happen in advance anyway, then we're out of luck; a pre-started backend +won't save enough time to be worth the effort. (Unless we are willing +to eliminate or redefine the troublesome options...) + + regards, tom lane + + diff --git a/doc/TODO.detail/pg_shadow b/doc/TODO.detail/pg_shadow new file mode 100644 index 0000000000..fcd6d2fe0e --- /dev/null +++ b/doc/TODO.detail/pg_shadow @@ -0,0 +1,55 @@ +From owner-pgsql-hackers@hub.org Sun Aug 2 20:01:13 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id UAA15937 + for ; Sun, 2 Aug 1998 20:01:11 -0400 (EDT) +Received: from hub.org (hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id TAA01026 for ; Sun, 2 Aug 1998 19:33:53 -0400 (EDT) +Received: from localhost (majordom@localhost) by hub.org (8.8.8/8.7.5) with SMTP id TAA19878; Sun, 2 Aug 1998 19:30:59 -0400 (EDT) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Sun, 02 Aug 1998 19:28:23 +0000 (EDT) +Received: (from majordom@localhost) by hub.org (8.8.8/8.7.5) id TAA19534 for pgsql-hackers-outgoing; Sun, 2 Aug 1998 19:28:22 -0400 (EDT) +Received: from sss.sss.pgh.pa.us (sss.pgh.pa.us [206.210.65.6]) by hub.org (8.8.8/8.7.5) with ESMTP id TAA19521 for ; Sun, 2 Aug 1998 19:28:15 -0400 (EDT) +Received: from sss.sss.pgh.pa.us (localhost [127.0.0.1]) + by sss.sss.pgh.pa.us (8.9.1/8.9.1) with ESMTP id TAA22594 + for ; Sun, 2 Aug 1998 19:28:13 -0400 (EDT) +To: pgsql-hackers@postgreSQL.org +Subject: [HACKERS] TODO item: make pg_shadow updates more robust +Date: Sun, 02 Aug 1998 19:28:13 -0400 +Message-ID: <22591.902100493@sss.pgh.pa.us> +From: Tom Lane +Sender: owner-pgsql-hackers@hub.org +Precedence: bulk +Status: ROr + +I learned the hard way last night that the postmaster's password +authentication routines don't look at the pg_shadow table. They +look at a separate file named pg_pwd, which certain backend operations +will update from pg_shadow. (This is not documented in any user +documentation that I could find; I had to burrow into +src/backend/commands/user.c to discover it.) + +Unfortunately, if a clueless dbadmin (like me ;-)) tries to update +password data with the obvious thing, + update pg_shadow set passwd = 'xxxxx' where usename = 'yyyy'; +pg_pwd doesn't get fixed. + +A more drastic problem is that pg_dump believes it can save and +restore pg_shadow data using "copy". Following an initdb and restore +from a pg_dump -z script, pg_shadow will look just fine, but only +the database admin will be listed in pg_pwd. This is likely to provoke +some confusion, IMHO. + +As a short-term thing, the fact that you *must* set passwords with +ALTER USER ought to be documented, preferably someplace where a +dbadmin who's never heard of ALTER USER is likely to find it. + +As a longer-term thing, I think it would be far better if ordinary +SQL operations on pg_shadow just did the right thing. Wouldn't it +be possible to implement copying to pg_pwd by means of a trigger on +pg_shadow updates, or something like that? + +(I'm afraid that pg_dump -z is pretty well broken for operations on +a password-protected database, btw. Has anyone used it successfully +in that situation?) 
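+
+The copying itself is simple enough to sketch in isolation: regenerate
+the whole flat file from the current rows and move it into place
+atomically.  (The record layout and names below are made up purely for
+illustration; the real pg_pwd format is whatever user.c writes.)
+
+    #include <stdio.h>
+
+    typedef struct { const char *usename; const char *passwd; } PwEntry;
+
+    /* Rewrite the flat password file from scratch, then rename() it
+     * into place so readers never see a half-written file. */
+    static int
+    rewrite_pwd_file(const char *path, const PwEntry *users, int nusers)
+    {
+        char  tmppath[1024];
+        FILE *fp;
+        int   i;
+
+        if (snprintf(tmppath, sizeof(tmppath), "%s.tmp", path)
+            >= (int) sizeof(tmppath))
+            return -1;
+        if ((fp = fopen(tmppath, "w")) == NULL)
+            return -1;
+        for (i = 0; i < nusers; i++)
+            fprintf(fp, "%s\t%s\n", users[i].usename, users[i].passwd);
+        if (fclose(fp) != 0)
+            return -1;
+        return rename(tmppath, path);
+    }
+
+    int
+    main(void)
+    {
+        PwEntry users[] = {{"alice", "apass"}, {"bob", "bpass"}};
+
+        return rewrite_pwd_file("pg_pwd", users, 2) == 0 ? 0 : 1;
+    }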
+ + regards, tom lane + + diff --git a/doc/TODO.detail/prepare b/doc/TODO.detail/prepare new file mode 100644 index 0000000000..de5b0bd4f9 --- /dev/null +++ b/doc/TODO.detail/prepare @@ -0,0 +1,98 @@ +From owner-pgsql-hackers@hub.org Wed Nov 18 14:40:49 1998 +Received: from hub.org (majordom@hub.org [209.47.148.200]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id OAA29743 + for ; Wed, 18 Nov 1998 14:40:36 -0500 (EST) +Received: from localhost (majordom@localhost) + by hub.org (8.9.1/8.9.1) with SMTP id OAA03716; + Wed, 18 Nov 1998 14:37:04 -0500 (EST) + (envelope-from owner-pgsql-hackers@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 18 Nov 1998 14:34:39 +0000 (EST) +Received: (from majordom@localhost) + by hub.org (8.9.1/8.9.1) id OAA03395 + for pgsql-hackers-outgoing; Wed, 18 Nov 1998 14:34:37 -0500 (EST) + (envelope-from owner-pgsql-hackers@postgreSQL.org) +Received: from orion.SAPserv.Hamburg.dsh.de (Tpolaris2.sapham.debis.de [53.2.131.8]) + by hub.org (8.9.1/8.9.1) with SMTP id OAA03381 + for ; Wed, 18 Nov 1998 14:34:31 -0500 (EST) + (envelope-from wieck@sapserv.debis.de) +Received: by orion.SAPserv.Hamburg.dsh.de + for pgsql-hackers@hub.org + id m0zgDnj-000EBTC; Wed, 18 Nov 98 21:02 MET +Message-Id: +From: jwieck@debis.com (Jan Wieck) +Subject: Re: [HACKERS] PREPARE +To: meskes@usa.net (Michael Meskes) +Date: Wed, 18 Nov 1998 21:02:06 +0100 (MET) +Cc: pgsql-hackers@hub.org +Reply-To: jwieck@debis.com (Jan Wieck) +In-Reply-To: <19981118084843.B869@usa.net> from "Michael Meskes" at Nov 18, 98 08:48:43 am +X-Mailer: ELM [version 2.4 PL25] +Content-Type: text +Sender: owner-pgsql-hackers@postgreSQL.org +Precedence: bulk +Status: RO + +Michael Meskes wrote: + +> +> On Wed, Nov 18, 1998 at 03:23:30AM +0000, Thomas G. Lockhart wrote: +> > > I didn't get this one completly. What input do you mean? +> > +> > Just the original string/query to be prepared... +> +> I see. But wouldn't it be more useful to preprocess the query and store the +> resulting nodes instead? We don't want to parse the statement everytime a +> variable binding comes in. + + Right. A real improvement would only be to have the prepared + execution plan in the backend and just giving the parameter + values. + + I can think of the following construct: + + PREPARE optimizable-statement; + + That one will run parser/rewrite/planner, create a new memory + context with a unique identifier and saves the querytree's + and plan's in it. Parameter values are identified by the + usual $n notation. The command returns the identifier. + + EXECUTE QUERY identifier [value [, ...]]; + + then get's back the prepared plan and querytree by the id, + creates an executor context with the given values in the + parameter array and calls ExecutorRun() for them. + + The PREPARE needs to analyze the resulting parsetrees to get + the datatypes (and maybe atttypmod's) of the parameters, so + EXECUTE QUERY can convert the values into Datum's using the + types input functions. And the EXECUTE has to be handled + special in tcop (it's something between a regular query and + an utility statement). But it's not too hard to implement. + + Finally a + + FORGET QUERY identifier; + + (don't remember how the others named it) will remove the + prepared plan etc. simply by destroying the memory context + and dropping the identifier from the id->mcontext+prepareinfo + mapping. + + This all restricts the usage of PREPARE to optimizable + statements. 
Is it required to be able to prepare utility + statements (like CREATE TABLE or so) too? + + +Jan + +-- + +#======================================================================# +# It's easier to get forgiveness for being wrong than for being right. # +# Let's break this rule - forgive me. # +#======================================== jwieck@debis.com (Jan Wieck) # + + + + diff --git a/doc/TODO.detail/primary b/doc/TODO.detail/primary new file mode 100644 index 0000000000..a5bd4cf264 --- /dev/null +++ b/doc/TODO.detail/primary @@ -0,0 +1,159 @@ +From owner-pgsql-hackers@hub.org Fri Sep 4 00:47:06 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id AAA01047 + for ; Fri, 4 Sep 1998 00:47:05 -0400 (EDT) +Received: from hub.org (hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id XAA02044 for ; Thu, 3 Sep 1998 23:11:07 -0400 (EDT) +Received: from localhost (majordom@localhost) by hub.org (8.8.8/8.7.5) with SMTP id XAA27418; Thu, 3 Sep 1998 23:06:16 -0400 (EDT) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Thu, 03 Sep 1998 23:04:11 +0000 (EDT) +Received: (from majordom@localhost) by hub.org (8.8.8/8.7.5) id XAA27185 for pgsql-hackers-outgoing; Thu, 3 Sep 1998 23:04:09 -0400 (EDT) +Received: from dune.krs.ru (dune.krs.ru [195.161.16.38]) by hub.org (8.8.8/8.7.5) with ESMTP id XAA27169 for ; Thu, 3 Sep 1998 23:03:59 -0400 (EDT) +Received: from krs.ru (localhost.krs.ru [127.0.0.1]) + by dune.krs.ru (8.8.8/8.8.8) with ESMTP id LAA10059; + Fri, 4 Sep 1998 11:03:00 +0800 (KRSS) + (envelope-from vadim@krs.ru) +Message-ID: <35EF5864.E5142D35@krs.ru> +Date: Fri, 04 Sep 1998 11:03:00 +0800 +From: Vadim Mikheev +Organization: OJSC Rostelecom (Krasnoyarsk) +X-Mailer: Mozilla 4.05 [en] (X11; I; FreeBSD 2.2.6-RELEASE i386) +MIME-Version: 1.0 +To: "D'Arcy J.M. Cain" +CC: "Thomas G. Lockhart" , hackers@postgreSQL.org +Subject: Re: [HACKERS] Adding PRIMARY KEY info +References: +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +Sender: owner-pgsql-hackers@hub.org +Precedence: bulk +Status: RO + +D'Arcy J.M. Cain wrote: +> +> Thus spake Vadim Mikheev +> > Imho, indices should be used/created for FOREIGN keys and so pg_index +> > is good place for both PRIMARY and FOREIGN keys infos. +> +> Are you sure? I don't know about implementing it but it seems more +> like an attribute thing rather than an index thing. Certainly from a +> database design viewpoint you want to refer to the fields, not the +> index on them. If you put it into the index then you have to do +> an extra join to get the information. +> +> Perhaps you have to do the extra join anyway for other purposes so it +> may not matter. All I want is to be able to be able to extract the +> field that the designer specified as the key. As long as I can design +> a select statement that gives me that I don't much care how it is +> implemented. I'll cache the information anyway so it won't have a +> huge impact on my programs. + +First, let me note that you have to add int28 field to pg_class, +not just oid field, to know what attributeS are in primary key +(we support multi-attribute primary keys). +This could be done... +But what about foreign and unique (!) keys ? +There may be _many_ foreign/unique keys defined for one table! +And so foreign/unique keys info have to be stored somewhere else, +not in pg_class. + +pg_index is good place for all _3_ key types because of: + +1. 
index should be created for each foreign key - + just for performance. +2. pg_index already has int28 field for key attributes. +3. pg_index already has indisunique (note that foreign keys + may reference unique keys, not just primary ones). + +- so we have just add two fields to pg_index: + +bool indisprimary; +oid indreferenced; +^^^^^^^^^^^^^^^^^^ +this is for foreign keys: oid of referenced relation' +primary/unique key index. + +I agreed that indices are just implementation... +If you don't like to store key infos in pg_index then +new pg_key relation have to be added... + +Comments ? + +Vadim + + +From owner-pgsql-hackers@hub.org Sat Sep 5 02:01:13 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.8.5/8.8.5) with ESMTP id CAA14437 + for ; Sat, 5 Sep 1998 02:01:11 -0400 (EDT) +Received: from hub.org (hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id BAA09928 for ; Sat, 5 Sep 1998 01:48:32 -0400 (EDT) +Received: from localhost (majordom@localhost) by hub.org (8.8.8/8.7.5) with SMTP id BAA18282; Sat, 5 Sep 1998 01:43:16 -0400 (EDT) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Sat, 05 Sep 1998 01:41:40 +0000 (EDT) +Received: (from majordom@localhost) by hub.org (8.8.8/8.7.5) id BAA18241 for pgsql-hackers-outgoing; Sat, 5 Sep 1998 01:41:38 -0400 (EDT) +Received: from dune.krs.ru (dune.krs.ru [195.161.16.38]) by hub.org (8.8.8/8.7.5) with ESMTP id BAA18211; Sat, 5 Sep 1998 01:41:21 -0400 (EDT) +Received: from krs.ru (localhost.krs.ru [127.0.0.1]) + by dune.krs.ru (8.8.8/8.8.8) with ESMTP id NAA20555; + Sat, 5 Sep 1998 13:40:44 +0800 (KRSS) + (envelope-from vadim@krs.ru) +Message-ID: <35F0CEDB.AD721090@krs.ru> +Date: Sat, 05 Sep 1998 13:40:43 +0800 +From: Vadim Mikheev +Organization: OJSC Rostelecom (Krasnoyarsk) +X-Mailer: Mozilla 4.05 [en] (X11; I; FreeBSD 2.2.6-RELEASE i386) +MIME-Version: 1.0 +To: "D'Arcy J.M. Cain" +CC: hackers@postgreSQL.org, pgsql-core@postgreSQL.org +Subject: Re: [HACKERS] Adding PRIMARY KEY info +References: +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +Sender: owner-pgsql-hackers@hub.org +Precedence: bulk +Status: ROr + +D'Arcy J.M. Cain wrote: +> +> > +> > pg_index is good place for all _3_ key types because of: +> > +> > 1. index should be created for each foreign key - +> > just for performance. +> > 2. pg_index already has int28 field for key attributes. +> > 3. pg_index already has indisunique (note that foreign keys +> > may reference unique keys, not just primary ones). +> > +> > - so we have just add two fields to pg_index: +> > +> > bool indisprimary; +> > oid indreferenced; +> > ^^^^^^^^^^^^^^^^^^ +> > this is for foreign keys: oid of referenced relation' +> > primary/unique key index. +> +> Sounds fine to me. Any chance of seeing this in 6.4? + +I could add this (and FOREIGN key implementation) before +11-13 Sep... But not the ALTER TABLE ADD/DROP CONSTRAINT +stuff (ok for Entry SQL). +But we are in beta... + +Comments? + +> Nope, pg_index is fine by me. Now, once we have this, how do we find +> the index for a particular attribute? I can't seem to figure out the +> relationship between pg_attribute and pg_index. The chart in the docs +> suggests that indkey is the relation but I can't see any useful info +> there for joining the tables. + +pg_index: + indrelid - oid of indexed relation + indkey - up to the 8 attnums + +pg_attribute: + attrelid - oid of relation + attnum - ... 
+ +Without outer join you have to query pg_attribute for each +valid attnum from pg_index->indkey -:( + +Vadim + + diff --git a/doc/TODO.detail/tcl_arrays b/doc/TODO.detail/tcl_arrays new file mode 100644 index 0000000000..3a20deed38 --- /dev/null +++ b/doc/TODO.detail/tcl_arrays @@ -0,0 +1,240 @@ +From owner-pgsql-patches@hub.org Wed Oct 14 17:31:26 1998 +Received: from renoir.op.net (root@renoir.op.net [209.152.193.4]) + by candle.pha.pa.us (8.9.0/8.9.0) with ESMTP id RAA01594 + for ; Wed, 14 Oct 1998 17:31:24 -0400 (EDT) +Received: from hub.org (majordom@hub.org [209.47.148.200]) by renoir.op.net (o1/$Revision: 1.1 $) with ESMTP id RAA01745 for ; Wed, 14 Oct 1998 17:12:28 -0400 (EDT) +Received: from localhost (majordom@localhost) + by hub.org (8.8.8/8.8.8) with SMTP id RAA06607; + Wed, 14 Oct 1998 17:10:43 -0400 (EDT) + (envelope-from owner-pgsql-patches@hub.org) +Received: by hub.org (TLB v0.10a (1.23 tibbs 1997/01/09 00:29:32)); Wed, 14 Oct 1998 17:10:27 +0000 (EDT) +Received: (from majordom@localhost) + by hub.org (8.8.8/8.8.8) id RAA06562 + for pgsql-patches-outgoing; Wed, 14 Oct 1998 17:10:26 -0400 (EDT) + (envelope-from owner-pgsql-patches@postgreSQL.org) +X-Authentication-Warning: hub.org: majordom set sender to owner-pgsql-patches@postgreSQL.org using -f +Received: from mambo.cs.unitn.it (mambo.cs.unitn.it [193.205.199.204]) + by hub.org (8.8.8/8.8.8) with SMTP id RAA06494 + for ; Wed, 14 Oct 1998 17:10:01 -0400 (EDT) + (envelope-from dz@cs.unitn.it) +Received: from nikita.wizard.net (ts-slip31.gelso.unitn.it [193.205.200.31]) by mambo.cs.unitn.it (8.6.12/8.6.12) with ESMTP id XAA20316 for ; Wed, 14 Oct 1998 23:09:52 +0200 +Received: (from dz@localhost) by nikita.wizard.net (8.8.5/8.6.9) id WAA00489 for pgsql-patches@postgreSQL.org; Wed, 14 Oct 1998 22:56:58 +0200 +From: Massimo Dal Zotto +Message-Id: <199810142056.WAA00489@nikita.wizard.net> +Subject: [PATCHES] TCL_ARRAYS +To: pgsql-patches@postgreSQL.org (Pgsql Patches) +Date: Wed, 14 Oct 1998 22:56:58 +0200 (MET DST) +X-Mailer: ELM [version 2.4 PL24 ME4] +MIME-Version: 1.0 +Content-Type: text/plain; charset=iso-8859-1 +Content-Transfer-Encoding: 8bit +Sender: owner-pgsql-patches@postgreSQL.org +Precedence: bulk +Status: RO + +Hi, + +I have written this patch which fixes some problems with TCL_ARRAYS. +The new array code uses a temporary buffer and is disabled by default +because it depends on contrib/string-io which most of you don't use. +This raises once again the problem of backslashes/escapes and various +ambiguities in pgsql output. I hope this will be solved in 6.5. + +*** src/interfaces/libpgtcl/pgtclCmds.c.orig Mon Sep 21 09:00:19 1998 +--- src/interfaces/libpgtcl/pgtclCmds.c Wed Oct 14 15:32:21 1998 +*************** +*** 602,616 **** + { + for (i = 0; i < PQnfields(result); i++) + { + sprintf(nameBuffer, "%d,%.200s", tupno, PQfname(result, i)); + if (Tcl_SetVar2(interp, arrVar, nameBuffer, +! #ifdef TCL_ARRAYS +! tcl_value(PQgetvalue(result, tupno, i)), + #else + PQgetvalue(result, tupno, i), +- #endif + TCL_LEAVE_ERR_MSG) == NULL) + return TCL_ERROR; + } + } + Tcl_AppendResult(interp, arrVar, 0); +--- 602,624 ---- + { + for (i = 0; i < PQnfields(result); i++) + { ++ #ifdef TCL_ARRAYS ++ char *buff = strdup(PQgetvalue(result, tupno, i)); + sprintf(nameBuffer, "%d,%.200s", tupno, PQfname(result, i)); + if (Tcl_SetVar2(interp, arrVar, nameBuffer, +! tcl_value(buff), +! TCL_LEAVE_ERR_MSG) == NULL) { +! free(buff); +! return TCL_ERROR; +! } +! 
free(buff); + #else ++ sprintf(nameBuffer, "%d,%.200s", tupno, PQfname(result, i)); ++ if (Tcl_SetVar2(interp, arrVar, nameBuffer, + PQgetvalue(result, tupno, i), + TCL_LEAVE_ERR_MSG) == NULL) + return TCL_ERROR; ++ #endif + } + } + Tcl_AppendResult(interp, arrVar, 0); +*************** +*** 636,643 **** + */ + for (tupno = 0; tupno < PQntuples(result); tupno++) + { + const char *field0 = PQgetvalue(result, tupno, 0); +! char * workspace = malloc(strlen(field0) + strlen(appendstr) + 210); + + for (i = 1; i < PQnfields(result); i++) + { +--- 644,674 ---- + */ + for (tupno = 0; tupno < PQntuples(result); tupno++) + { ++ #ifdef TCL_ARRAYS ++ char *buff = strdup(PQgetvalue(result, tupno, 0)); ++ const char *field0 = tcl_value(buff); ++ char *workspace = malloc(strlen(field0) + 210 + strlen(appendstr)); ++ ++ for (i = 1; i < PQnfields(result); i++) ++ { ++ free(buff); ++ buff = strdup(PQgetvalue(result, tupno, i)); ++ sprintf(workspace, "%s,%.200s%s", field0, PQfname(result,i), ++ appendstr); ++ if (Tcl_SetVar2(interp, arrVar, workspace, ++ tcl_value(buff), ++ TCL_LEAVE_ERR_MSG) == NULL) ++ { ++ free(buff); ++ free(workspace); ++ return TCL_ERROR; ++ } ++ } ++ free(buff); ++ free(workspace); ++ #else + const char *field0 = PQgetvalue(result, tupno, 0); +! char *workspace = malloc(strlen(field0) + 210 + strlen(appendstr)); + + for (i = 1; i < PQnfields(result); i++) + { +*************** +*** 652,657 **** +--- 683,689 ---- + } + } + free(workspace); ++ #endif + } + Tcl_AppendResult(interp, arrVar, 0); + return TCL_OK; +*************** +*** 669,676 **** +--- 701,716 ---- + Tcl_AppendResult(interp, "argument to getTuple cannot exceed number of tuples - 1", 0); + return TCL_ERROR; + } ++ #ifdef TCL_ARRAYS ++ for (i = 0; i < PQnfields(result); i++) { ++ char *buff = strdup(PQgetvalue(result, tupno, i)); ++ Tcl_AppendElement(interp, tcl_value(buff)); ++ free(buff); ++ } ++ #else + for (i = 0; i < PQnfields(result); i++) + Tcl_AppendElement(interp, PQgetvalue(result, tupno, i)); ++ #endif + return TCL_OK; + } + else if (strcmp(opt, "-tupleArray") == 0) +*************** +*** 688,697 **** +--- 728,748 ---- + } + for (i = 0; i < PQnfields(result); i++) + { ++ #ifdef TCL_ARRAYS ++ char *buff = strdup(PQgetvalue(result, tupno, i)); ++ if (Tcl_SetVar2(interp, argv[4], PQfname(result, i), ++ tcl_value(buff), ++ TCL_LEAVE_ERR_MSG) == NULL) { ++ free(buff); ++ return TCL_ERROR; ++ } ++ free(buff); ++ #else + if (Tcl_SetVar2(interp, argv[4], PQfname(result, i), + PQgetvalue(result, tupno, i), + TCL_LEAVE_ERR_MSG) == NULL) + return TCL_ERROR; ++ #endif + } + return TCL_OK; + } +*************** +*** 1303,1310 **** + sprintf(buffer, "%d", tupno); + Tcl_SetVar2(interp, argv[3], ".tupno", buffer, 0); + + for (column = 0; column < ncols; column++) +! Tcl_SetVar2(interp, argv[3], info[column].cname, PQgetvalue(result, tupno, column), 0); + + Tcl_SetVar2(interp, argv[3], ".command", "update", 0); + +--- 1354,1371 ---- + sprintf(buffer, "%d", tupno); + Tcl_SetVar2(interp, argv[3], ".tupno", buffer, 0); + ++ #ifdef TCL_ARRAYS ++ for (column = 0; column < ncols; column++) { ++ char *buff = strdup(PQgetvalue(result, tupno, column)); ++ Tcl_SetVar2(interp, argv[3], info[column].cname, ++ tcl_value(buff), 0); ++ free(buff); ++ } ++ #else + for (column = 0; column < ncols; column++) +! Tcl_SetVar2(interp, argv[3], info[column].cname, +! PQgetvalue(result, tupno, column), 0); +! 
#endif + + Tcl_SetVar2(interp, argv[3], ".command", "update", 0); + +*** src/include/config.h.in.orig Wed Aug 26 09:01:16 1998 +--- src/include/config.h.in Wed Oct 14 22:44:00 1998 +*************** +*** 312,318 **** + * of postgres C-like arrays, for example {{"a1" "a2"} {"b1" "b2"}} instead + * of {{"a1","a2"},{"b1","b2"}}. + */ +! #define TCL_ARRAYS + + /* + * The following flag allows limiting the number of rows returned by a query. +--- 312,318 ---- + * of postgres C-like arrays, for example {{"a1" "a2"} {"b1" "b2"}} instead + * of {{"a1","a2"},{"b1","b2"}}. + */ +! /* #define TCL_ARRAYS */ + + /* + * The following flag allows limiting the number of rows returned by a query. + +-- +Massimo Dal Zotto + ++----------------------------------------------------------------------+ +| Massimo Dal Zotto email: dz@cs.unitn.it | +| Via Marconi, 141 phone: ++39-461-534251 | +| 38057 Pergine Valsugana (TN) www: http://www.cs.unitn.it/~dz/ | +| Italy pgp: finger dz@tango.cs.unitn.it | ++----------------------------------------------------------------------+ + +
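+One detail worth spelling out about the pgtclCmds.c hunks above:
+PQgetvalue() returns a pointer into storage owned by the PGresult, so
+the patch always strdup()s the value before handing it to tcl_value()
+(which presumably rewrites the string in place) and frees the copy on
+every path.  Stripped of the Tcl calls, the pattern is just this --
+tcl_value() here is only a stand-in for the real converter:
+
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+
+    static char *
+    tcl_value(char *value)      /* stand-in; real one rewrites arrays */
+    {
+        return value;
+    }
+
+    int
+    main(void)
+    {
+        const char *from_libpq = "{\"a1\",\"a2\"}"; /* as if from PQgetvalue() */
+        char       *buff = strdup(from_libpq);      /* private, writable copy */
+
+        if (buff == NULL)
+            return 1;
+        printf("%s\n", tcl_value(buff));            /* converted value goes to Tcl */
+        free(buff);                                 /* freed before returning */
+        return 0;
+    }
+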