Roach died. Out Of Memory

I can repeatably kill a roach node with an out of memory error.

Before I start filing an issue I should ask if I have shot myself in the foot.

My three roaches are v1.1.3 and run on 3 identical Google Cloud instances with 4.5 GB RAM each.

I start them with:

cockroach start --certs-dir=certs  --cache=50% --background

Is that 50% too much? It’s not totally clear to me what it means. Seems to be ignored anyway.

While I’m here, here is the code that triggers the OOM after about 2500 queries:

// node-postgres; assumes a Pool configured for this cluster (connection settings omitted here)
const { Pool } = require('pg')
const pool = new Pool()

const query = `
    UPSERT INTO aggregates_1
        SELECT
            source_id,
            $1::timestamp with time zone + ($2::interval * $3::integer) AS "timestamp",  -- Start of aggregate period (the row's timestamp)
            avg(value) AS average,
            stddev(value) AS standard_deviation,
            count(value) AS count
        FROM series_1
        WHERE "timestamp" >= $1::timestamp + ($2::interval * $3::integer)  AND           -- Start of aggregate period
              "timestamp" <  $1::timestamp + ($2::interval * ($3::integer + 1))          -- End of aggregate period
        GROUP BY source_id;`

const startTimestamp = '2017-09-07 16:00:00+00:00'
const interval = '1 hours'
let offset = 0

function makeAggregates() {
  pool.query(query, [startTimestamp, interval, offset], (err, res) => {
    if (err) {
      console.log('Error:', err)
    } else {
      console.log('OK: ', offset)
      offset += 1;
      makeAggregates()
    }
  })
}

makeAggregates()

@heater, our production recommendation is to increase --max-sql-memory to at least 25%, specifically for in-memory processing of rows when using ORDER BY, GROUP BY, DISTINCT, joins, and window functions. Can you give that a try and see if it solves the problem?

There’s also the --max-disk-temp-storage flag, which lets you increase the maximum on-disk storage capacity available to store temporary data for SQL queries that exceed the memory budget. The default is 32GiB. @bdarnell or @marc, could increasing the temp storage size also help in situations like this?

@heater, in addition to @jesse's comments, do you mind sharing the exact OOM error you are getting?

Sure. I have seen this twice now:

fatal error: runtime: out of memory

runtime stack:
runtime.throw(0x1eb6ee5, 0x16)
	/usr/local/go/src/runtime/panic.go:596 +0x95
runtime.sysMap(0xc510240000, 0x100000, 0x7fe5f71fda00, 0x3265e58)
	/usr/local/go/src/runtime/mem_linux.go:216 +0x1d0
runtime.(*mheap).sysAlloc(0x322b3c0, 0x100000, 0x7fe5f7a15ae8)
	/usr/local/go/src/runtime/malloc.go:440 +0x374
runtime.(*mheap).grow(0x322b3c0, 0x2c, 0x0)
	/usr/local/go/src/runtime/mheap.go:774 +0x62
runtime.(*mheap).allocSpanLocked(0x322b3c0, 0x2c, 0x200)
	/usr/local/go/src/runtime/mheap.go:678 +0x44f
runtime.(*mheap).alloc_m(0x322b3c0, 0x2c, 0x100000000, 0xc400000000)
	/usr/local/go/src/runtime/mheap.go:562 +0xe2
runtime.(*mheap).alloc.func1()
	/usr/local/go/src/runtime/mheap.go:627 +0x4b
runtime.systemstack(0x7fe5f71fdad8)
	/usr/local/go/src/runtime/asm_amd64.s:343 +0xab
runtime.(*mheap).alloc(0x322b3c0, 0x2c, 0x10100000000, 0xc50f2101c8)
	/usr/local/go/src/runtime/mheap.go:628 +0xa0
runtime.largeAlloc(0x58000, 0x7fe5f8c00001, 0xc50000003a)
	/usr/local/go/src/runtime/malloc.go:807 +0x93
runtime.mallocgc.func1()
	/usr/local/go/src/runtime/malloc.go:702 +0x3e
runtime.systemstack(0xc420018000)
	/usr/local/go/src/runtime/asm_amd64.s:327 +0x79
runtime.mstart()
	/usr/local/go/src/runtime/proc.go:1132

goroutine 497203 [running]:
runtime.systemstack_switch()
	/usr/local/go/src/runtime/asm_amd64.s:281 fp=0xc50f210148 sp=0xc50f210140
runtime.mallocgc(0x58000, 0x1cfbea0, 0x1, 0x0)
	/usr/local/go/src/runtime/malloc.go:703 +0x930 fp=0xc50f2101e8 sp=0xc50f210148
runtime.growslice(0x1cfbea0, 0xc5101c6000, 0x1180, 0x1180, 0x1181, 0xc50f210318, 0xcf6653, 0xc50f2791af)
	/usr/local/go/src/runtime/slice.go:147 +0x152 fp=0xc50f210270 sp=0xc50f2101e8
github.com/cockroachdb/cockroach/pkg/storage/engine.mvccScanInternal.func1(0xc50f2791a2, 0xc, 0xd, 0xc50f2791af, 0x8, 0x8, 0x14fbe83564aef692, 0x0, 0x31b7, 0x4000, ...)
	/go/src/github.com/cockroachdb/cockroach/pkg/storage/engine/mvcc.go:1566 +0x420 fp=0xc50f210328 sp=0xc50f210270
github.com/cockroachdb/cockroach/pkg/storage/engine.MVCCIterate(0x7fe5f8bcff28, 0xc50f20a210, 0x2d7c0c0, 0xc505ffd7a0, 0xc5100f1695, 0xc, 0xd, 0xc50fff94c0, 0x3, 0x3, ...)
	/go/src/github.com/cockroachdb/cockroach/pkg/storage/engine/mvcc.go:1708 +0x9e9 fp=0xc50f2107d8 sp=0xc50f210328
github.com/cockroachdb/cockroach/pkg/storage/engine.mvccScanInternal(0x7fe5f8bcff28, 0xc50f20a210, 0x2d7c0c0, 0xc505ffd7a0, 0xc5100f1695, 0xc, 0xd, 0xc50fff94c0, 0x3, 0x3, ...)
	/go/src/github.com/cockroachdb/cockroach/pkg/storage/engine/mvcc.go:1568 +0x321 fp=0xc50f210910 sp=0xc50f2107d8
github.com/cockroachdb/cockroach/pkg/storage/engine.MVCCScan(0x7fe5f8bcff28, 0xc50f20a210, 0x2d7c0c0, 0xc505ffd7a0, 0xc5100f1695, 0xc, 0xd, 0xc50fff94c0, 0x3, 0x3, ...)
	/go/src/github.com/cockroachdb/cockroach/pkg/storage/engine/mvcc.go:1590 +0xf8 fp=0xc50f2109e8 sp=0xc50f210910
github.com/cockroachdb/cockroach/pkg/storage.evalScan(0x7fe5f8bcff28, 0xc50f20a210, 0x2d7fce0, 0xc505ffd7a0, 0xc420b65c00, 0xc504c5b920, 0x14fdb3900fad4f53, 0x0, 0x100000001, 0x1, ...)
	/go/src/github.com/cockroachdb/cockroach/pkg/storage/replica_command.go:421 +0x1b0 fp=0xc50f210f50 sp=0xc50f2109e8
github.com/cockroachdb/cockroach/pkg/storage.evaluateCommand(0x7fe5f8bcff28, 0xc50f20a210, 0x0, 0x0, 0x0, 0x2d7fce0, 0xc505ffd7a0, 0xc420b65c00, 0xc504c5b920, 0x0, ...)
	/go/src/github.com/cockroachdb/cockroach/pkg/storage/replica_command.go:199 +0x2ef fp=0xc50f211780 sp=0xc50f210f50
github.com/cockroachdb/cockroach/pkg/storage.evaluateBatch(0x7fe5f8bcff28, 0xc50f20a210, 0x0, 0x0, 0x2d7fce0, 0xc505ffd7a0, 0xc420b65c00, 0xc504c5b920, 0x0, 0x14fdb3900fad4f53, ...)
	/go/src/github.com/cockroachdb/cockroach/pkg/storage/replica.go:5062 +0x332 fp=0xc50f2125d0 sp=0xc50f211780
github.com/cockroachdb/cockroach/pkg/storage.(*Replica).executeReadOnlyBatch(0xc420b65c00, 0x7fe5f8bcff28, 0xc50f20a210, 0x14fdb3900fad4f53, 0x0, 0x100000001, 0x1, 0x76, 0x3ff0000000000000, 0xc505ffd6c0, ...)
	/go/src/github.com/cockroachdb/cockroach/pkg/storage/replica.go:2458 +0x4d1 fp=0xc50f212bb0 sp=0xc50f2125d0
github.com/cockroachdb/cockroach/pkg/storage.(*Replica).Send(0xc420b65c00, 0x7fe5f8bcff28, 0xc50f20a210, 0x14fdb3900fad4f53, 0x0, 0x100000001, 0x1, 0x76, 0x3ff0000000000000, 0xc505ffd6c0, ...)
	/go/src/github.com/cockroachdb/cockroach/pkg/storage/replica.go:1794 +0x296 fp=0xc50f212d00 sp=0xc50f212bb0
github.com/cockroachdb/cockroach/pkg/storage.(*Store).Send(0xc420694c00, 0x7fe5f8bcff28, 0xc50f20a1e0, 0x14fdb3900fad4f53, 0x0, 0x100000001, 0x1, 0x76, 0x3ff0000000000000, 0xc505ffd6c0, ...)
	/go/src/github.com/cockroachdb/cockroach/pkg/storage/store.go:2561 +0x5c8 fp=0xc50f213508 sp=0xc50f212d00
github.com/cockroachdb/cockroach/pkg/storage.(*Stores).Send(0xc4205f42c0, 0x7fe5f8bcff28, 0xc50f20a180, 0x0, 0x0, 0x100000001, 0x1, 0x76, 0x3ff0000000000000, 0xc505ffd6c0, ...)
	/go/src/github.com/cockroachdb/cockroach/pkg/storage/stores.go:186 +0x1ea fp=0xc50f213670 sp=0xc50f213508
github.com/cockroachdb/cockroach/pkg/server.(*Node).batchInternal.func1(0x7fe5f8bcff28, 0xc50f20a180, 0x0, 0x0)
	/go/src/github.com/cockroachdb/cockroach/pkg/server/node.go:842 +0x1a5 fp=0xc50f213768 sp=0xc50f213670
github.com/cockroachdb/cockroach/pkg/util/stop.(*Stopper).RunTaskWithErr(0xc42080d320, 0x7fe5f8bcff28, 0xc50f20a150, 0x1eacbec, 0x10, 0xc50f213810, 0x0, 0x0)
	/go/src/github.com/cockroachdb/cockroach/pkg/util/stop/stopper.go:246 +0xf7 fp=0xc50f2137a0 sp=0xc50f213768
github.com/cockroachdb/cockroach/pkg/server.(*Node).batchInternal(0xc4207b8300, 0x7fe5f8bcff28, 0xc50f20a150, 0xc506d445a8, 0xc50f20a150, 0xc5018ef6c0, 0x1c813c0)
	/go/src/github.com/cockroachdb/cockroach/pkg/server/node.go:853 +0x159 fp=0xc50f213a40 sp=0xc50f2137a0
github.com/cockroachdb/cockroach/pkg/server.(*Node).Batch(0xc4207b8300, 0x7fe5f8bcff28, 0xc50f20a150, 0xc506d445a8, 0x0, 0x0, 0x0)
	/go/src/github.com/cockroachdb/cockroach/pkg/server/node.go:870 +0x9c fp=0xc50f213ac0 sp=0xc50f213a40
github.com/cockroachdb/cockroach/pkg/kv.(*grpcTransport).SendNext.func1.1(0xc50f536770, 0xc506d44580, 0x7fe5f8bd02f8, 0xc5018ef6c0, 0xc50787d780, 0xc42302a100, 0xc50787d7a8)
	/go/src/github.com/cockroachdb/cockroach/pkg/kv/transport.go:224 +0x4a5 fp=0xc50f213f40 sp=0xc50f213ac0
github.com/cockroachdb/cockroach/pkg/kv.(*grpcTransport).SendNext.func1(0xc50f536770, 0xc506d44580, 0x7fe5f8bd02f8, 0xc5018ef6c0, 0xc504c5b860)
	/go/src/github.com/cockroachdb/cockroach/pkg/kv/transport.go:237 +0xad fp=0xc50f213fb8 sp=0xc50f213f40
runtime.goexit()
	/usr/local/go/src/runtime/asm_amd64.s:2197 +0x1 fp=0xc50f213fc0 sp=0xc50f213fb8
created by github.com/cockroachdb/cockroach/pkg/kv.(*grpcTransport).SendNext
	/go/src/github.com/cockroachdb/cockroach/pkg/kv/transport.go:249 +0x19b

goroutine 1 [select, 107 minutes]:
github.com/cockroachdb/cockroach/pkg/cli.runStart(0x2d48500, 0xc4202848e0, 0x0, 0x2, 0x0, 0x0)
	/go/src/github.com/cockroachdb/cockroach/pkg/cli/start.go:599 +0xf70
github.com/cockroachdb/cockroach/pkg/cli.MaybeDecorateGRPCError.func1(0x2d48500, 0xc4202848e0, 0x0, 0x2, 0xc4206b5c90, 0x1791a88)
	/go/src/github.com/cockroachdb/cockroach/pkg/cli/error.go:35 +0x5a
github.com/cockroachdb/cockroach/pkg/cli.MaybeShoutError.func1(0x2d48500, 0xc4202848e0, 0x0, 0x2, 0x0, 0x0)

And a ton more like that.

@jesse no, I believe that if temp storage is exceeded, the query would get an error, not use more memory.

The groupNode/aggregator processor could be using a lot of memory, but the fact that the query works 2500 times before it fails, and that the stack trace is in mvccScanInternal, casts doubt on that being the problem. I think it’s worth filing an issue.

I restarted my three roaches with

cockroach start --certs-dir=certs --max-sql-memory=25% --cache=50% --background

As suggested above.

Running the same aggregate query again, many thousands of times, has not yet resulted in an OOM crash after 19 hours.

But memory usage has been climbing steadily. top now reports 63%, and the cockroach console shows 2.8, 3.1 and 2.8 GB for the three nodes.

There are only 500,000 rows in the table I’m iterating over, and about 9000 in the resulting aggregate table.

I’ll file an issue. Perhaps I’ll wait a bit longer first, though, to see what happens next.

The --cache argument specifies how much cache RocksDB (our underlying key-value storage system) uses, as a fraction of total memory on the machine. How much of that is actually used is not easily predictable, but it’s not unusual for it to climb to the maximum size even under low load.

Given that the machines have only 4.5 GB RAM and 2.25 GB of that goes to the cache, there is not much memory left for everything else. I’d recommend a lower cache percentage given the small amount of memory.

Still running after 2 days. Memory usage now up to 67%.

I have changed

const interval = '1 hours'

to

const interval = '15 minutes'

In the code I posted above.

With 500,000 rows in my source table ‘series_1’, that produces 35,000 rows in my destination table ‘aggregates_1’ after about 9000 queries. It has taken most of the 2 days to do this!

Anyway, what I wanted to say here is that each query is very small. Each SELECT from the source table only returns 15 rows. I imagine that in itself does not require a lot of RAM.

There is nothing else running on those roach nodes.

Clearly something is leaking here. But how to find out what?

After about 9000 queries. It has taken most of the 2 days to do this!

Are you saying that it took 2 days to run 9000 queries? So ~20 seconds per query?

Is there an index on timestamp? If not, we are forced to read the entire table every time, which would explain why the queries are slow.
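
For example, something like this will list the indexes on the table (SHOW CREATE TABLE series_1 works too):

SHOW INDEX FROM series_1;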

I don’t think there is clear evidence of a memory leak. We can assume that 50% of the machine’s memory will be used by the RocksDB cache (this cache is filled as much as possible, regardless of load). The remaining 17% (roughly 750 MB, i.e. the 67% you observe minus the 50% cache) is not unreasonable for CockroachDB internals (metadata caches, timeseries/statistics, etc.).

No, my aggregates query, as per first post, is painfully slow but not that slow. I have been truncating my destination table and starting over whenever I have a moment.

The table being averaged has a primary key based on source_id and timestamp, like so:

CREATE TABLE series_1 (
	source_id INTEGER NOT NULL,
	"timestamp" TIMESTAMP WITH TIME ZONE NOT NULL,
	value INTEGER NOT NULL,
	CONSTRAINT "primary" PRIMARY KEY (source_id ASC, "timestamp" ASC),
	CONSTRAINT fk_source_id_ref_sources FOREIGN KEY (source_id) REFERENCES sources (source_id),
	FAMILY "primary" (source_id, "timestamp", value)
)

The table receiving the aggregates looks like so:

CREATE TABLE aggregates_1 (
	source_id INTEGER NOT NULL,
	"timestamp" TIMESTAMP WITH TIME ZONE NOT NULL,
	average DECIMAL NULL,
	standard_deviation DECIMAL NULL,
	count INTEGER NULL,
	CONSTRAINT "primary" PRIMARY KEY (source_id ASC, "timestamp" ASC),
	FAMILY "primary" (source_id, "timestamp", average, standard_deviation, count)
)

Starting over again just now, with an empty aggregates table and a source table of 500,000 rows, I time 100 iterations of the query as taking 8 minutes 33 seconds; that’s 5.1 seconds per iteration.

Meanwhile memory consumption is up to 70.1%

It has been rising a lot more slowly than the first two times I ran this, since I changed the options as per the post above.

This primary key for series_1 isn’t helpful for a WHERE "timestamp" ... clause, meaning each query still has to scan the entire table. I would try adding an index on timestamp.

Ah, excuse my SQL noobie status again here. I was under the impression, from reading around, that my primary key was a suitable index.

Anyway, I added an index on timestamp like so:

CREATE INDEX timestamp_idx ON series_1 ("timestamp");

Now the series_1 table looks like:

CREATE TABLE series_1 (
	source_id INTEGER NOT NULL,
	"timestamp" TIMESTAMP WITH TIME ZONE NOT NULL,
	value INTEGER NOT NULL,
	CONSTRAINT "primary" PRIMARY KEY (source_id ASC, "timestamp" ASC),
	INDEX timestamp_idx ("timestamp" ASC),
	CONSTRAINT fk_source_id_ref_sources FOREIGN KEY (source_id) REFERENCES sources (source_id),
	FAMILY "primary" (source_id, "timestamp", value)
)

It has not helped much with the progress of my query. Now 100 iterations take 8 minutes 26 seconds, or 5.06 seconds per iteration.

It probably does not help that my three roaches are in three different Google data centers on the west coast of the USA and I am in Finland!

The good news is that top now reports 68.3% memory consumption. That is the first time I have ever seen it go down!

Can you post the output of EXPLAIN <query>? (you can replace the placeholders with some typical values)

You mean like:

EXPLAIN UPSERT INTO aggregates_1
    SELECT
        source_id,
        '2017-10-01'::timestamp with time zone + ('1 hour'::interval * 0::integer) AS "timestamp",  -- Start of aggregate period (the row's timestamp)
        avg(value) AS average,
        stddev(value) AS standard_deviation,
        count(value) AS count
    FROM series_1
    WHERE "timestamp" >= '2017-10-01'::timestamp + ('1 hour'::interval * 0::integer)  AND           -- Start of aggregate period
            "timestamp" <  '2017-10-01'::timestamp + ('1 hour'::interval * (0::integer + 1))          -- End of aggregate period
    GROUP BY source_id;

Which, when punched into the SQL shell, produces this:

> EXPLAIN UPSERT INTO aggregates_1 SELECT source_id, '2017-10-01'::TIMESTAMP WITH TIME ZONE + ('1 hour'::INTERVAL * 0::INTEGER) AS "timestamp", avg(value) AS average, stddev(value) AS standard_deviation, count(value) AS count FROM series_1 WHERE ("timestamp" >= ('2017-10-01'::TIMESTAMP + ('1 hour'::INTERVAL * 0::INTEGER))) AND ("timestamp" < ('2017-10-01'::TIMESTAMP + ('1 hour'::INTERVAL * (0::INTEGER + 1)))) GROUP BY source_id;
+-------+--------+----------+------------------------------------------------------------------------+
| Level |  Type  |  Field   |                              Description                               |
+-------+--------+----------+------------------------------------------------------------------------+
|     0 | insert |          |                                                                        |
|     0 |        | into     | aggregates_1(source_id, timestamp, average, standard_deviation, count) |
|     1 | render |          |                                                                        |
|     2 | group  |          |                                                                        |
|     2 |        | group by | @1-@1                                                                  |
|     3 | render |          |                                                                        |
|     4 | scan   |          |                                                                        |
|     4 |        | table    | series_1@primary                                                       |
|     4 |        | spans    | ALL                                                                    |
+-------+--------+----------+------------------------------------------------------------------------+
(9 rows)

Time: 217.1083ms

Hmm… From what little I understand, that “scan … table … series_1@primary” means it’s using my primary key as the index.

Something is wrong; we should be using the index. Filed https://github.com/cockroachdb/cockroach/issues/20604

Try using ::timestamptz instead of ::timestamp in the query. It seems that the problem is happening because the column is TIMESTAMP WITH TIME ZONE and we’re providing TIMESTAMP boundaries.
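
Concretely, the boundary expressions in the parameterized query would become something like:

WHERE "timestamp" >= $1::timestamptz + ($2::interval * $3::integer)  AND
      "timestamp" <  $1::timestamptz + ($2::interval * ($3::integer + 1))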

Ouch, what is going on there?

My original kind of query:

SELECT * FROM series_1 WHERE ("timestamp" >= ('2017-10-01'::TIMESTAMP + ('1 hour'::INTERVAL * 0::INTEGER))) AND ("timestamp" < ('2017-10-01'::TIMESTAMP + ('1 hour'::INTERVAL * (0::INTEGER + 1))));

Takes over 3 seconds.

Your example with the cruft removed:

SELECT * FROM series_1 WHERE ("timestamp" >= '2017-10-01 00:00:00+00:00') AND ("timestamp" < '2017-10-01 01:00:00+00:00');

Takes 300 or 400 ms!

Looking at EXPLAIN for those I do see that the latter uses the new index timestamp_idx.
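
For reference, the check was just (same values as above):

EXPLAIN SELECT * FROM series_1 WHERE ("timestamp" >= '2017-10-01 00:00:00+00:00') AND ("timestamp" < '2017-10-01 01:00:00+00:00');

The scan line for this one shows series_1@timestamp_idx, where the plan above showed series_1@primary with spans ALL.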

At this rate it would be quicker to parameterize the query using JavaScript string literals …

See my suggestion above - try using ::timestamptz instead of ::timestamp in the query.

Ah, I cannot use timestampz. Nothing but syntax errors. This already confused the socks off me for some time until I discovered “timestamp with time zone” works.

So now I have replaced all of the ::timestamp casts in my query with ::timestamp with time zone.

My query now looks like this:

UPSERT INTO aggregates_1
        SELECT
            source_id,
            $1::timestamp with time zone + ($2::interval * $3::integer) AS "timestamp",  -- Start of aggregate period (the row's timestamp)
            avg(value) AS average,
            stddev(value) AS standard_deviation,
            count(value) AS count
        FROM series_1
        WHERE "timestamp" >= $1::timestamp with time zone + ($2::interval * $3::integer)  AND           -- Start of aggregate period
              "timestamp" <  $1::timestamp with time zone + ($2::interval * ($3::integer + 1))          -- End of aggregate period
        GROUP BY source_id;

Blow me down, it’s ten times faster: 49 seconds for a hundred iterations, or 490 ms per query!

Did I mention before, you guys are hot!

Thanks.