Improve efficiency of dblink by using libpq's new row processor API.

This patch provides a test case for libpq's row processor API. contrib/dblink can deal with very large result sets by dumping them into a tuplestore (which can spill to disk) --- but until now, the intermediate storage of the query result in a PGresult meant memory bloat for any large result. Now we use a row processor to convert the data to tuple form and dump it directly into the tuplestore. A limitation is that this only works for plain dblink() queries, not dblink_send_query() followed by dblink_get_result(). In the latter case we don't know the desired tuple rowtype soon enough. While hack solutions to that are possible, a different user-level API would probably be a better answer. Kyotaro Horiguchi, reviewed by Marko Kreen and Tom Lane
2012-04-04 18:39:08 -04:00 · 2012-04-04 18:39:08 -04:00 · 6f922ef88e
parent 92785dac2e
commit 6f922ef88e
2 changed files with 374 additions and 83 deletions
--- a/contrib/dblink/dblink.c
+++ b/contrib/dblink/dblink.c
@ -63,12 +63,28 @@ typedef struct remoteConn
 	bool		newXactForCursor;		/* Opened a transaction for a cursor */
 } remoteConn;

+typedef struct storeInfo
+{
+	FunctionCallInfo fcinfo;
+	Tuplestorestate *tuplestore;
+	AttInMetadata *attinmeta;
+	MemoryContext tmpcontext;
+	char	  **cstrs;
+} storeInfo;
+
 /*
 * Internal declarations
 */
 static Datum dblink_record_internal(FunctionCallInfo fcinfo, bool is_async);
 static void prepTuplestoreResult(FunctionCallInfo fcinfo);
 static void materializeResult(FunctionCallInfo fcinfo, PGresult *res);
+static void materializeQueryResult(FunctionCallInfo fcinfo,
+					   PGconn *conn,
+					   const char *conname,
+					   const char *sql,
+					   bool fail);
+static int storeHandler(PGresult *res, const PGdataValue *columns,
+			 const char **errmsgp, void *param);
 static remoteConn *getConnectionByName(const char *name);
 static HTAB *createConnHash(void);
 static void createNewConnection(const char *name, remoteConn *rconn);
@ -629,100 +645,118 @@ dblink_get_result(PG_FUNCTION_ARGS)
 static Datum
 dblink_record_internal(FunctionCallInfo fcinfo, bool is_async)
 {
-	char	   *msg;
-	PGresult   *res = NULL;
-	PGconn	   *conn = NULL;
-	char	   *connstr = NULL;
-	char	   *sql = NULL;
-	char	   *conname = NULL;
-	remoteConn *rconn = NULL;
-	bool		fail = true;	/* default to backward compatible */
-	bool		freeconn = false;
+	PGconn	   *volatile conn = NULL;
+	volatile bool freeconn = false;

 	prepTuplestoreResult(fcinfo);

 	DBLINK_INIT;

-	if (!is_async)
+	PG_TRY();
 	{
-		if (PG_NARGS() == 3)
+		char	   *msg;
+		char	   *connstr = NULL;
+		char	   *sql = NULL;
+		char	   *conname = NULL;
+		remoteConn *rconn = NULL;
+		bool		fail = true;	/* default to backward compatible */
+
+		if (!is_async)
 		{
-			/* text,text,bool */
-			DBLINK_GET_CONN;
-			sql = text_to_cstring(PG_GETARG_TEXT_PP(1));
-			fail = PG_GETARG_BOOL(2);
-		}
-		else if (PG_NARGS() == 2)
-		{
-			/* text,text or text,bool */
-			if (get_fn_expr_argtype(fcinfo->flinfo, 1) == BOOLOID)
-			{
-				conn = pconn->conn;
-				sql = text_to_cstring(PG_GETARG_TEXT_PP(0));
-				fail = PG_GETARG_BOOL(1);
-			}
-			else
+			if (PG_NARGS() == 3)
 			{
+				/* text,text,bool */
 				DBLINK_GET_CONN;
 				sql = text_to_cstring(PG_GETARG_TEXT_PP(1));
+				fail = PG_GETARG_BOOL(2);
+			}
+			else if (PG_NARGS() == 2)
+			{
+				/* text,text or text,bool */
+				if (get_fn_expr_argtype(fcinfo->flinfo, 1) == BOOLOID)
+				{
+					conn = pconn->conn;
+					sql = text_to_cstring(PG_GETARG_TEXT_PP(0));
+					fail = PG_GETARG_BOOL(1);
+				}
+				else
+				{
+					DBLINK_GET_CONN;
+					sql = text_to_cstring(PG_GETARG_TEXT_PP(1));
+				}
+			}
+			else if (PG_NARGS() == 1)
+			{
+				/* text */
+				conn = pconn->conn;
+				sql = text_to_cstring(PG_GETARG_TEXT_PP(0));
+			}
+			else
+				/* shouldn't happen */
+				elog(ERROR, "wrong number of arguments");
+		}
+		else	/* is_async */
+		{
+			/* get async result */
+			if (PG_NARGS() == 2)
+			{
+				/* text,bool */
+				DBLINK_GET_NAMED_CONN;
+				fail = PG_GETARG_BOOL(1);
+			}
+			else if (PG_NARGS() == 1)
+			{
+				/* text */
+				DBLINK_GET_NAMED_CONN;
+			}
+			else
+				/* shouldn't happen */
+				elog(ERROR, "wrong number of arguments");
+		}
+
+		if (!conn)
+			DBLINK_CONN_NOT_AVAIL;
+
+		if (!is_async)
+		{
+			/* synchronous query, use efficient tuple collection method */
+			materializeQueryResult(fcinfo, conn, conname, sql, fail);
+		}
+		else
+		{
+			/* async result retrieval, do it the old way */
+			PGresult   *res = PQgetResult(conn);
+
+			/* NULL means we're all done with the async results */
+			if (res)
+			{
+				if (PQresultStatus(res) != PGRES_COMMAND_OK &&
+					PQresultStatus(res) != PGRES_TUPLES_OK)
+				{
+					dblink_res_error(conname, res, "could not execute query",
+									 fail);
+					/* if fail isn't set, we'll return an empty query result */
+				}
+				else
+				{
+					materializeResult(fcinfo, res);
+				}
 			}
 		}
-		else if (PG_NARGS() == 1)
-		{
-			/* text */
-			conn = pconn->conn;
-			sql = text_to_cstring(PG_GETARG_TEXT_PP(0));
-		}
-		else
-			/* shouldn't happen */
-			elog(ERROR, "wrong number of arguments");
 	}
-	else	/* is_async */
+	PG_CATCH();
 	{
-		/* get async result */
-		if (PG_NARGS() == 2)
-		{
-			/* text,bool */
-			DBLINK_GET_NAMED_CONN;
-			fail = PG_GETARG_BOOL(1);
-		}
-		else if (PG_NARGS() == 1)
-		{
-			/* text */
-			DBLINK_GET_NAMED_CONN;
-		}
-		else
-			/* shouldn't happen */
-			elog(ERROR, "wrong number of arguments");
+		/* if needed, close the connection to the database */
+		if (freeconn)
+			PQfinish(conn);
+		PG_RE_THROW();
 	}
+	PG_END_TRY();

-	if (!conn)
-		DBLINK_CONN_NOT_AVAIL;
-
-	/* synchronous query, or async result retrieval */
-	if (!is_async)
-		res = PQexec(conn, sql);
-	else
-	{
-		res = PQgetResult(conn);
-		/* NULL means we're all done with the async results */
-		if (!res)
-			return (Datum) 0;
-	}
-
-	/* if needed, close the connection to the database and cleanup */
+	/* if needed, close the connection to the database */
 	if (freeconn)
 		PQfinish(conn);

-	if (!res ||
-		(PQresultStatus(res) != PGRES_COMMAND_OK &&
-		 PQresultStatus(res) != PGRES_TUPLES_OK))
-	{
-		dblink_res_error(conname, res, "could not execute query", fail);
-		return (Datum) 0;
-	}
-
-	materializeResult(fcinfo, res);
 	return (Datum) 0;
 }

@ -890,6 +924,259 @@ materializeResult(FunctionCallInfo fcinfo, PGresult *res)
 	PG_END_TRY();
 }

+/*
+ * Execute the given SQL command and store its results into a tuplestore
+ * to be returned as the result of the current function.
+ * This is equivalent to PQexec followed by materializeResult, but we make
+ * use of libpq's "row processor" API to reduce per-row overhead.
+ */
+static void
+materializeQueryResult(FunctionCallInfo fcinfo,
+					   PGconn *conn,
+					   const char *conname,
+					   const char *sql,
+					   bool fail)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	PGresult   *volatile res = NULL;
+	storeInfo	sinfo;
+
+	/* prepTuplestoreResult must have been called previously */
+	Assert(rsinfo->returnMode == SFRM_Materialize);
+
+	PG_TRY();
+	{
+		/* initialize storeInfo to empty */
+		memset(&sinfo, 0, sizeof(sinfo));
+		sinfo.fcinfo = fcinfo;
+
+		/* We'll collect tuples using storeHandler */
+		PQsetRowProcessor(conn, storeHandler, &sinfo);
+
+		res = PQexec(conn, sql);
+
+		/* We don't keep the custom row processor installed permanently */
+		PQsetRowProcessor(conn, NULL, NULL);
+
+		if (!res ||
+			(PQresultStatus(res) != PGRES_COMMAND_OK &&
+			 PQresultStatus(res) != PGRES_TUPLES_OK))
+		{
+			/*
+			 * dblink_res_error will clear the passed PGresult, so we need
+			 * this ugly dance to avoid doing so twice during error exit
+			 */
+			PGresult   *res1 = res;
+
+			res = NULL;
+			dblink_res_error(conname, res1, "could not execute query", fail);
+			/* if fail isn't set, we'll return an empty query result */
+		}
+		else if (PQresultStatus(res) == PGRES_COMMAND_OK)
+		{
+			/*
+			 * storeHandler didn't get called, so we need to convert the
+			 * command status string to a tuple manually
+			 */
+			TupleDesc	tupdesc;
+			AttInMetadata *attinmeta;
+			Tuplestorestate *tupstore;
+			HeapTuple	tuple;
+			char	   *values[1];
+			MemoryContext oldcontext;
+
+			/*
+			 * need a tuple descriptor representing one TEXT column to return
+			 * the command status string as our result tuple
+			 */
+			tupdesc = CreateTemplateTupleDesc(1, false);
+			TupleDescInitEntry(tupdesc, (AttrNumber) 1, "status",
+							   TEXTOID, -1, 0);
+			attinmeta = TupleDescGetAttInMetadata(tupdesc);
+
+			oldcontext = MemoryContextSwitchTo(
+									rsinfo->econtext->ecxt_per_query_memory);
+			tupstore = tuplestore_begin_heap(true, false, work_mem);
+			rsinfo->setResult = tupstore;
+			rsinfo->setDesc = tupdesc;
+			MemoryContextSwitchTo(oldcontext);
+
+			values[0] = PQcmdStatus(res);
+
+			/* build the tuple and put it into the tuplestore. */
+			tuple = BuildTupleFromCStrings(attinmeta, values);
+			tuplestore_puttuple(tupstore, tuple);
+
+			PQclear(res);
+		}
+		else
+		{
+			Assert(PQresultStatus(res) == PGRES_TUPLES_OK);
+			/* storeHandler should have created a tuplestore */
+			Assert(rsinfo->setResult != NULL);
+
+			PQclear(res);
+		}
+	}
+	PG_CATCH();
+	{
+		/* be sure to unset the custom row processor */
+		PQsetRowProcessor(conn, NULL, NULL);
+		/* be sure to release any libpq result we collected */
+		if (res)
+			PQclear(res);
+		/* and clear out any pending data in libpq */
+		while ((res = PQskipResult(conn)) != NULL)
+			PQclear(res);
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+}
+
+/*
+ * Custom row processor for materializeQueryResult.
+ * Prototype of this function must match PQrowProcessor.
+ */
+static int
+storeHandler(PGresult *res, const PGdataValue *columns,
+			 const char **errmsgp, void *param)
+{
+	storeInfo  *sinfo = (storeInfo *) param;
+	int			nfields = PQnfields(res);
+	char	  **cstrs = sinfo->cstrs;
+	HeapTuple	tuple;
+	char	   *pbuf;
+	int			pbuflen;
+	int			i;
+	MemoryContext oldcontext;
+
+	if (columns == NULL)
+	{
+		/* Prepare for new result set */
+		ReturnSetInfo *rsinfo = (ReturnSetInfo *) sinfo->fcinfo->resultinfo;
+		TupleDesc	tupdesc;
+
+		/*
+		 * It's possible to get more than one result set if the query string
+		 * contained multiple SQL commands.  In that case, we follow PQexec's
+		 * traditional behavior of throwing away all but the last result.
+		 */
+		if (sinfo->tuplestore)
+			tuplestore_end(sinfo->tuplestore);
+		sinfo->tuplestore = NULL;
+
+		/* get a tuple descriptor for our result type */
+		switch (get_call_result_type(sinfo->fcinfo, NULL, &tupdesc))
+		{
+			case TYPEFUNC_COMPOSITE:
+				/* success */
+				break;
+			case TYPEFUNC_RECORD:
+				/* failed to determine actual type of RECORD */
+				ereport(ERROR,
+						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+						 errmsg("function returning record called in context "
+								"that cannot accept type record")));
+				break;
+			default:
+				/* result type isn't composite */
+				elog(ERROR, "return type must be a row type");
+				break;
+		}
+
+		/* make sure we have a persistent copy of the tupdesc */
+		tupdesc = CreateTupleDescCopy(tupdesc);
+
+		/* check result and tuple descriptor have the same number of columns */
+		if (nfields != tupdesc->natts)
+			ereport(ERROR,
+					(errcode(ERRCODE_DATATYPE_MISMATCH),
+					 errmsg("remote query result rowtype does not match "
+							"the specified FROM clause rowtype")));
+
+		/* Prepare attinmeta for later data conversions */
+		sinfo->attinmeta = TupleDescGetAttInMetadata(tupdesc);
+
+		/* Create a new, empty tuplestore */
+		oldcontext = MemoryContextSwitchTo(
+									rsinfo->econtext->ecxt_per_query_memory);
+		sinfo->tuplestore = tuplestore_begin_heap(true, false, work_mem);
+		rsinfo->setResult = sinfo->tuplestore;
+		rsinfo->setDesc = tupdesc;
+		MemoryContextSwitchTo(oldcontext);
+
+		/*
+		 * Set up sufficiently-wide string pointers array; this won't change
+		 * in size so it's easy to preallocate.
+		 */
+		if (sinfo->cstrs)
+			pfree(sinfo->cstrs);
+		sinfo->cstrs = (char **) palloc(nfields * sizeof(char *));
+
+		/* Create short-lived memory context for data conversions */
+		if (!sinfo->tmpcontext)
+			sinfo->tmpcontext =
+				AllocSetContextCreate(CurrentMemoryContext,
+									  "dblink temporary context",
+									  ALLOCSET_DEFAULT_MINSIZE,
+									  ALLOCSET_DEFAULT_INITSIZE,
+									  ALLOCSET_DEFAULT_MAXSIZE);
+
+		return 1;
+	}
+
+	CHECK_FOR_INTERRUPTS();
+
+	/*
+	 * Do the following work in a temp context that we reset after each tuple.
+	 * This cleans up not only the data we have direct access to, but any
+	 * cruft the I/O functions might leak.
+	 */
+	oldcontext = MemoryContextSwitchTo(sinfo->tmpcontext);
+
+	/*
+	 * The strings passed to us are not null-terminated, but the datatype
+	 * input functions we're about to call require null termination.  Copy the
+	 * strings and add null termination.  As a micro-optimization, allocate
+	 * all the strings with one palloc.
+	 */
+	pbuflen = nfields;		/* count the null terminators themselves */
+	for (i = 0; i < nfields; i++)
+	{
+		int			len = columns[i].len;
+
+		if (len > 0)
+			pbuflen += len;
+	}
+	pbuf = (char *) palloc(pbuflen);
+
+	for (i = 0; i < nfields; i++)
+	{
+		int			len = columns[i].len;
+
+		if (len < 0)
+			cstrs[i] = NULL;
+		else
+		{
+			cstrs[i] = pbuf;
+			memcpy(pbuf, columns[i].value, len);
+			pbuf += len;
+			*pbuf++ = '\0';
+		}
+	}
+
+	/* Convert row to a tuple, and add it to the tuplestore */
+	tuple = BuildTupleFromCStrings(sinfo->attinmeta, cstrs);
+
+	tuplestore_puttuple(sinfo->tuplestore, tuple);
+
+	/* Clean up */
+	MemoryContextSwitchTo(oldcontext);
+	MemoryContextReset(sinfo->tmpcontext);
+
+	return 1;
+}
+
 /*
 * List all open dblink connections by name.
 * Returns an array of all connection names.
--- a/doc/src/sgml/dblink.sgml
+++ b/doc/src/sgml/dblink.sgml
@ -425,14 +425,6 @@ SELECT *
  <refsect1>
   <title>Notes</title>

-   <para>
-    <function>dblink</> fetches the entire remote query result before
-    returning any of it to the local system.  If the query is expected
-    to return a large number of rows, it's better to open it as a cursor
-    with <function>dblink_open</> and then fetch a manageable number
-    of rows at a time.
-   </para>
-
   <para>
    A convenient way to use <function>dblink</> with predetermined
    queries is to create a view.
@ -1432,6 +1424,18 @@ dblink_get_result(text connname [, bool fail_on_error]) returns setof record
    sent, and one additional time to obtain an empty set result,
    before the connection can be used again.
   </para>
+
+   <para>
+    When using <function>dblink_send_query</> and
+    <function>dblink_get_result</>, <application>dblink</> fetches the entire
+    remote query result before returning any of it to the local query
+    processor.  If the query returns a large number of rows, this can result
+    in transient memory bloat in the local session.  It may be better to open
+    such a query as a cursor with <function>dblink_open</> and then fetch a
+    manageable number of rows at a time.  Alternatively, use plain
+    <function>dblink()</>, which avoids memory bloat by spooling large result
+    sets to disk.
+   </para>
  </refsect1>

  <refsect1>