The Directed Acyclic Graph (DAG) is a concept I run into over-and-over again; which is, perhaps, somewhat ironic.
A DAG is a representation of vertices (nodes) that are connected by directional edges (arcs—i.e., lines) in such a way that there are no cycles (e.g., starting at Node A
, you shouldn’t be able to return to Node A
).
DAGs have lots of uses in computer science problems and in discrete mathematics. You’ll find DAGs in build-systems, network problems, and, importantly (for this blog post, if not generally) in Git.
One way to think of a DAG is as a set of dependencies—each node may have a dependency on one or more other nodes. That is, in order to get to Node B
you must route through Node A
, so Node B
depends on Node A
:
The visualization of dependencies in a JSON object is (SURPRISE!) different from the input format needed to visualize a DAG using the D3.js Force layout. To change the above object into Force’s expected input, I created a little helper function:
var forceFormat = function(dag) {
var orderedNodes = [],
nodes = [],
links = [],
usesPack = false;
// Basically a dumb Object.keys
for (node in dag) {
if ( !dag.hasOwnProperty( node ) ) continue;
orderedNodes.push(node);
}
orderedNodes.forEach(function(node) {
var sources = dag[node];
if (!sources) return;
sources.forEach(function(source) {
var source = orderedNodes.indexOf(source);
// If the source isn't in the Git DAG, it's in a packfile
if (source < 0) {
if (usesPack) return;
source = orderedNodes.length;
usesPack = true;
}
links.push({
'source': source,
'target': orderedNodes.indexOf(node)
});
});
nodes.push({'name': node});
});
// Add pack file to end of list
if (usesPack) nodes.push({'name': 'PACK'});
return { 'nodes': nodes, 'links': links };
};
var forceInput = forceFormat(dag);
forceFormat
outputs a JSON object that can be used as input for the Force layout.
{
"links": [
{
"source": "Node A",
"target": "Node B"
}
],
"nodes": [
{ "name": "Node A" },
{ "name": "Node B" }
]
}
I can pass this resulting JSON object off to a function that I created after a long time staring at one of mbostock’s many amazing examples to create a D3 Force graph of verticies and edges:
// http://bl.ocks.org/mbostock/1138500
var makeGraph = function(target, graphData) {
var target = d3.select(target),
bounds = target.node().getBoundingClientRect(),
fill = d3.scale.category20(),
radius = 25;
var svg = target.append('svg')
.attr('width', bounds.width)
.attr('height', bounds.height);
// Arrow marker for end-of-line arrow
svg.append('defs').append('marker')
.attr('id', 'arrowhead')
.attr('refX', 17.5)
.attr('refY', 2)
.attr('markerWidth', 8)
.attr('markerHeight', 4)
.attr('orient', 'auto')
.attr('fill', '#ccc')
.append('path')
.attr('d', 'M 0,0 V 4 L6,2 Z');
var link = svg.selectAll('line')
.data(graphData.links)
.enter()
.append('line')
.attr('class', 'link')
.attr('marker-end', 'url(#arrowhead)');
// Create a group for each node
var node = svg.selectAll('g')
.data(graphData.nodes)
.enter()
.append('g');
// Color the node based on node's git-type (otherwise, hot pink!)
node.append('circle')
.attr('r', radius)
.attr('class', 'node')
.attr('fill', function(d) {
var blue = '#1BA1E2',
red = 'tomato',
green = '#5BB75B',
pink = '#FE57A1';
if (d.name.endsWith('.b')) { return red; }
if (d.name.endsWith('.t')) { return blue; }
if (d.name.endsWith('.c')) { return green; }
return pink;
});
node.append('text')
.attr('y', radius * 1.5)
.attr('text-anchor', 'middle')
.attr('fill', '#555')
.text(function(d) {
if (d.name.length > 10) {
return d.name.substring(0, 8) + '...';
}
return d.name;
});
// If the node has a type: tag it
node.append('text')
.attr('text-anchor', 'middle')
.attr('y', 4)
.attr('fill', 'white')
.attr('class', 'bold-text')
.text(function(d) {
if (d.name.endsWith('.b')) { return 'BLOB'; }
if (d.name.endsWith('.t')) { return 'TREE'; }
if (d.name.endsWith('.c')) { return 'COMMIT'; }
return '';
});
var charge = 700 * graphData.nodes.length;
var force = d3.layout.force()
.size([bounds.width, bounds.height])
.nodes(graphData.nodes)
.links(graphData.links)
.linkDistance(150)
.charge(-(charge))
.gravity(1)
.on('tick', tick);
// No fancy animation, tick amount varies based on number of nodes
force.start();
for (var i = 0; i < graphData.nodes.length * 100; ++i) force.tick();
force.stop();
function tick(e) {
// Push sources up and targets down to form a weak tree.
var k = -12 * e.alpha;
link
.each(function(d) { d.source.y -= k, d.target.y += k; })
.attr('x2', function(d) { return d.source.x; })
.attr('y2', function(d) { return d.source.y; })
.attr('x1', function(d) { return d.target.x; })
.attr('y1', function(d) { return d.target.y; });
node
.attr('transform', function(d) {
return 'translate(' + d.x + ',' + d.y + ')';
});
}
};
makeGraph('.merkle-1', forceInput);
You’d be forgiven for thinking that is a line.
This directional line is a DAG—albeit a simple one. Node B
depends on Node A
and that is the whole graph. If you want to get to Node B
then you have to start at Node A
. Depending on your problem-space, Node B
could be many things: A place in Königsberg, a target in a Makefile (or a Rakefile), or (brace yourself) a Git object.
Git Object Anatomy ¶
In order to understand how Git is a DAG, you need to understand Git “objects”:
$ mkdir merkle
$ cd merkle
$ echo 'This is the beginning' > README
$ git init
$ git add .
$ git -m 'Initial Commit'
$ find .git/objects/ -type f
.git/objects/1b/9f426a8407ffee551ad2993c5d7d3780296353
.git/objects/09/8e6de29daf4e55f83406b49f5768df9bc7d624
.git/objects/1a/06ce381ac14f7a5baa1670691c2ff8a73aa6da
What are Git objects? Because they look like nonsense:
After a little digging through the Pro Git book, Git objects are a little less non-sensicle. Git objects are simply zlib
compressed, formatted messages:
$ python2 -c 'import sys,zlib; \
print zlib.decompress(sys.stdin.read());' \
< .git/objects/1a/06ce381ac14f7a5baa1670691c2ff8a73aa6da
commit 195tree 098e6de29daf4e55f83406b49f5768df9bc7d624
author Tyler Cipriani <tcipriani@wikimedia.org> 1458604120 -0700
committer Tyler Cipriani <tcipriani@wikimedia.org> 1458604120 -0700
Initial Commit
Parts of that message are obvious: author
and committer
obviously come from my .gitconfig
. There is a Unix epoch timestamp with a timezone offset. commit
is the type of object. 195
is the byte-length of the remainder of the message.
There are a few parts of that message that aren’t immediately obvious. What is tree 098e6de29daf4e55f83406b49f5768df9bc7d624
? And why would we store this message in .git/objects/1a/06ce381ac14f7a5baa1670691c2ff8a73aa6da
and not .git/objects/commit-message
? Is a merkle what I think it is? The answer to all of these questions and many more is the same: Cryptographic Hash Functions.
Hash Functions ¶
A cryptographic hash function is a function that when given an input of any length it creates a fixed-length output. Furthermore (and more importantly), the fixed-length output should be unique to a given input; any change in input will likely cause a big change in the output. Git uses a cryptographic hash function called Secure Hash Algorithm 1 (SHA-1).
You can play with the SHA-1 function on the command line:
$ echo 'message' | sha1sum
1133e3acf0a4cbb9d8b3bfd3f227731b8cd2650b -
$ echo 'message' | sha1sum
1133e3acf0a4cbb9d8b3bfd3f227731b8cd2650b -
$ echo 'message1' | sha1sum
c133514a60a4641b83b365d3dc7b715dc954e010 -
Note the big change in the output of sha1sum
from a tiny change in input. This is what cryptographic hash functions do.
Hash that DAG! ¶
Now that we have some idea of what is inside a commit object, let’s reverse-engineer the commit object from the HEAD
of our merkle
repo:
$ python2 -c 'import sys,zlib; \
print zlib.decompress(sys.stdin.read());' \
< .git/objects/1a/06ce381ac14f7a5baa1670691c2ff8a73aa6da | \
od -c
0000000 c o m m i t 1 9 5 \0 t r e e
0000020 0 9 8 e 6 d e 2 9 d a f 4 e 5 5
0000040 f 8 3 4 0 6 b 4 9 f 5 7 6 8 d f
0000060 9 b c 7 d 6 2 4 \n a u t h o r
0000100 T y l e r C i p r i a n i <
0000120 t c i p r i a n i @ w i k i m e
0000140 d i a . o r g > 1 4 5 8 6 0 4
0000160 1 2 0 - 0 7 0 0 \n c o m m i t
0000200 t e r T y l e r C i p r i a
0000220 n i < t c i p r i a n i @ w i
0000240 k i m e d i a . o r g > 1 4 5
0000260 8 6 0 4 1 2 0 - 0 7 0 0 \n \n I
0000300 n i t i a l C o m m i t \n \n
0000317
$ printf 'tree 098e6de29daf4e55f83406b49f5768df9bc7d62k4\n' >> commit-msg
$ printf 'author Tyler Cipriani <tcipriani@wikimedia.org> 1458604120 -0700\n' >> commit-msg
$ printf 'committer Tyler Cipriani <tcipriani@wikimedia.org> 1458604120 -0700\n' >> commit-msg
$ printf '\nInitial Commit\n' >> commit-msg
$ sha1sum <(cat \
<(printf "commit ") \
<(wc -c < commit-msg | tr -d '\n') \
<(printf '%b' '\0') commit-msg)
1a06ce381ac14f7a5baa1670691c2ff8a73aa6da /dev/fd/63
Hmm… that seems familiar
$ export COMMIT_HASH=$(sha1sum <(cat <(printf "commit ") <(wc -c < commit-msg | tr -d '\n') <(printf '%b' '\0') commit-msg) | cut -d' ' -f1)
$ find ".git/objects/${COMMIT_HASH:0:2}" -type f -name "${COMMIT_HASH:(-38)}"
.git/objects/1a/06ce381ac14f7a5baa1670691c2ff8a73aa6da
The commit object is a zlib-compressed, formatted message that is stored in a file named after the SHA-1 hash of the file’s un-zlib
compressed contents.
(/me wipes brow)
Let’s use git-cat-file
to see if we can explore the tree 098e6de29daf4e55f83406b49f5768df9bc7d62k4
-part of the commit message object:
$ git cat-file -p 1a06ce381ac14f7a5baa1670691c2ff8a73aa6da
tree 098e6de29daf4e55f83406b49f5768df9bc7d624
author Tyler Cipriani <tcipriani@wikimedia.org> 1458604120 -0700
committer Tyler Cipriani <tcipriani@wikimedia.org> 1458604120 -0700
$ git cat-file -p 098e6de29daf4e55f83406b49f5768df9bc7d624
100644 blob 1b9f426a8407ffee551ad2993c5d7d3780296353 README
Hey that’s the text I put into README
!
So .git/HEAD
refers to .git/refs/heads/master
, calling git-cat-file
on the object found inside that file shows that it’s the commit object we recreated. The commit object points to 098e6de29daf4e55f83406b49f5768df9bc7d624
, which is a tree object with the contents: 100644 blob 1b9f426a8407ffee551ad2993c5d7d3780296353 README
The blob
object 1b9f426a8407ffee551ad2993c5d7d3780296353
is the contents of README
! So it seems each commit
object points to a tree
object that points to other objects.
What was I talking about? Merkle DAGs? D3.js? ¶
Let’s see if we can paste together what Git is doing at a low-level when we make a new commit:
- Take the contents of
README
, hash the contents using SHA-1, and store as ablob
object in.git/objects
. - Create a directory listing of the git working directory, listing each file, with its directory permissions and its hash value. Hash this directory listing and store as a
tree
in.git/objects
. - Take the commit message, along with info from
.gitconfig
and the hash of the top-level tree. Hash this information and store it as acommit
object in.git/objects
.
It seems that there may be a chain of dependencies:
var gitDag = {
// blob (add .b for blob)
'1b9f426a8407ffee551ad2993c5d7d3780296353.b': [],
// tree (.t == tree) is a hash that includes the hash from blob
'098e6de29daf4e55f83406b49f5768df9bc7d624.t': ['1b9f426a8407ffee551ad2993c5d7d3780296353.b'],
// commit (.c == commit) is a hash that includes the hash from tree
'1a06ce381ac14f7a5baa1670691c2ff8a73aa6da.c': ['098e6de29daf4e55f83406b49f5768df9bc7d624.t'],
};
makeGraph('.merkle-2', forceFormat(gitDag));
You’d be forgiven for thinking that is a line.
What’s really happening is that there is a commit
object (1a06ce38
) that depends on a tree
object (098e6de2
) that depends on a blob
(1b9f426a
).
Since it’s running each of these objects through a hash function and each of them contains a reference up the chain of dependencies, a minor change to either the blob
or the tree
will create a drastically different commit
object.
Applying a cryptographic hash function on top of a graph was Ralph Merkle’s big idea. This scheme makes magic possible. Transferring verifiable and trusted information through an untrusted medium is toatz for realz possible with Ralph’s little scheme.
The idea is that if you have the root-node hash, that is, the cryptographic hash of the node that depends on all other nodes (the commit
object in Git), and you obtained that root-node hash from a trusted source, you can trust all sub-nodes that stem from that root node if the hash of all those sub-root-nodes matches the root-node hash!
This is the mechanism by which things like Git, IPFS, Bitcoin, and BitTorrent are made possible: changing any one node in the graph changes all nodes that depend on that node all the way to the root-node (the commit
in Git).
Tales from the Merkle Graph ¶
I wrote a simple NodeJS script that creates a graph that is suitable for input into the JavaScript that I’ve already written that will create a D3.js force graph with whatever it finds in .git/objects
.
#!/usr/bin/env nodejs
/* makeDag - creates a JSON dependency graph from .git/objects */
var glob = require('glob'),
fs = require('fs'),
zlib = require('zlib');
var types = ['tree', 'commit', 'blob'],
treeRegex = {
// 100644 README\0[20 byte sha1]
regex: /[0-9]+\s[^\0]+\0((.|\n){20})/gm,
fn: function(sha) {
var buf = new Buffer(sha[1], 'binary');
return buf.toString('hex') + '.b';
}
},
commitRegex = {
// tree 098e6de29daf4e55f83406b49f5768df9bc7d624
regex: /(tree|parent)\s([a-f0-9]{40})/gm,
fn: function(sha) {
if (sha[1] === 'tree') {
return sha[2] + '.t';
}
return sha[2] + '.c';
}
},
total = 0,
final = {};
// determine file type, parse out SHA1s
var handleObjects = function(objData, name) {
types.forEach(function(type) {
var re, regex, match, key;
if (!objData.startsWith(type)) { return; }
key = name + '.' + type[0];
final[key] = [];
if (type === 'tree') { objType = treeRegex; }
if (type === 'commit') { objType = commitRegex; }
if (type === 'blob') { return; }
// Remove the object-type and size from file
objData = objData.split('\0');
objData.shift();
objData = objData.join('\0');
// Recursive regex match remainder
while ((match = objType.regex.exec(objData)) !== null) {
final[key].push(objType.fn(match));
}
});
// Don't output until you've got it all
if (Object.keys(final).length !== total) {
return;
}
// Output what ya got.
console.log(final);
};
// Readable object names not file names
var getName = function(file) {
var fileParts = file.split('/'),
len = fileParts.length;
return fileParts[len - 2] + fileParts[len - 1];
}
// Inflate the deflated git object file
var handleFile = function(file, out) {
var name = getName(file);
fs.readFile(file, function(e, data) {
zlib.inflate(data, function(e, data) {
if (e) { console.log(file, e); return; }
handleObjects(data.toString('binary'), name);
});
});
};
// Sort through the gitobjects directory
var handleFiles = function(files) {
files.forEach(function(file) {
fs.stat(file, function(e, f) {
if (e) { return; }
if (f.isFile()) {
// Don't worry about pack files for now
if (file.indexOf('pack') > -1) { return; }
total++;
handleFile(file);
}
});
});
};
(function() {
glob('.git/objects/**/*', function(e, f) {
if (e) { throw e; }
handleFiles(f);
});
})();
Merkle graph transformations are often difficult to describe, but easy to see. Using this last piece of code to create and view graphs for several repositories has been illuminating. The graph visualization both illuminates and challenges my understanding of Git in ways I didn’t anticipate.
The Tale of Commit Message Bike-shedding ¶
When you change your commit message, what happens to the graph? What depends on a commit? Where is the context for a commit?
$ git commit --amend -m 'This is the commit message now'
[master 585448a] This is the commit message now
Date: Mon Mar 21 16:48:40 2016 -0700
1 file changed, 1 insertion(+)
create mode 100644 README
$ find .git/objects -type f
.git/objects/1b/9f426a8407ffee551ad2993c5d7d3780296353
.git/objects/09/8e6de29daf4e55f83406b49f5768df9bc7d624
.git/objects/1a/06ce381ac14f7a5baa1670691c2ff8a73aa6da
.git/objects/da/94af3a21ac7e0c875bbbe6162aa1d26d699c73
Now the DAG is a bit different:
var gitDag = { '098e6de29daf4e55f83406b49f5768df9bc7d624.t': [ '1b9f426a8407ffee551ad2993c5d7d3780296353.b' ],
'1a06ce381ac14f7a5baa1670691c2ff8a73aa6da.c': [ '098e6de29daf4e55f83406b49f5768df9bc7d624.t' ],
'1b9f426a8407ffee551ad2993c5d7d3780296353.b': [],
'da94af3a21ac7e0c875bbbe6162aa1d26d699c73.c': [ '098e6de29daf4e55f83406b49f5768df9bc7d624.t' ] }
makeGraph('.merkle-3', forceFormat(gitDag));
Here we see that there are now two commit
objects (1a06ce38
and da94af3a
) that both depend on a single tree
object (098e6de2
) that depends on a single blob
(1b9f426a
).
One of these commit objects will never be seen with git log
.
The Orphan Blob That Dared to Dream ¶
TIL: Git creates blob
objects as soon as a file is added to the staging area.
$ echo 'staged' > staged
$ find .git/objects -type f
.git/objects/1b/9f426a8407ffee551ad2993c5d7d3780296353
.git/objects/09/8e6de29daf4e55f83406b49f5768df9bc7d624
.git/objects/1a/06ce381ac14f7a5baa1670691c2ff8a73aa6da
.git/objects/da/94af3a21ac7e0c875bbbe6162aa1d26d699c73
Notice that nothing depends on this object just yet. It’s a lonely orphan blob
.
$ git add staged
$ find .git/objects -type f
.git/objects/1b/9f426a8407ffee551ad2993c5d7d3780296353
.git/objects/09/8e6de29daf4e55f83406b49f5768df9bc7d624
.git/objects/1a/06ce381ac14f7a5baa1670691c2ff8a73aa6da
.git/objects/da/94af3a21ac7e0c875bbbe6162aa1d26d699c73
.git/objects/19/d9cc8584ac2c7dcf57d2680375e80f099dc481
$ makeDag
{ '098e6de29daf4e55f83406b49f5768df9bc7d624.t': [ '1b9f426a8407ffee551ad2993c5d7d3780296353.b' ],
'19d9cc8584ac2c7dcf57d2680375e80f099dc481.b': [],
'1a06ce381ac14f7a5baa1670691c2ff8a73aa6da.c': [ '098e6de29daf4e55f83406b49f5768df9bc7d624.t' ],
'da94af3a21ac7e0c875bbbe6162aa1d26d699c73.c': [ '098e6de29daf4e55f83406b49f5768df9bc7d624.t' ],
'1b9f426a8407ffee551ad2993c5d7d3780296353.b': [] }
Even unstaging and deleting the file doesn’t remove the object. Orphan objects in git are only garbage collected as part of git gc --prune
.
When this object is committed to the repo, it creates a whole new layer of the graph:
$ git commit -m 'Add staged file'
[master 4f407b3] Add staged file
1 file changed, 1 insertion(+)
create mode 100644 staged
$ makeDag
{ '098e6de29daf4e55f83406b49f5768df9bc7d624.t': [ '1b9f426a8407ffee551ad2993c5d7d3780296353.b' ],
'19d9cc8584ac2c7dcf57d2680375e80f099dc481.b': [],
'1a06ce381ac14f7a5baa1670691c2ff8a73aa6da.c': [ '098e6de29daf4e55f83406b49f5768df9bc7d624.t' ],
'1b9f426a8407ffee551ad2993c5d7d3780296353.b': [],
'4f407b396e6ecbb65de6cf192259c18ecd4d1e9b.c':
[ '7ce38101e91de29ee0fee3aa9940cc81159e0f8d.t',
'da94af3a21ac7e0c875bbbe6162aa1d26d699c73.c' ],
'7ce38101e91de29ee0fee3aa9940cc81159e0f8d.t':
[ '1b9f426a8407ffee551ad2993c5d7d3780296353.b',
'19d9cc8584ac2c7dcf57d2680375e80f099dc481.b' ],
'da94af3a21ac7e0c875bbbe6162aa1d26d699c73.c': [ '098e6de29daf4e55f83406b49f5768df9bc7d624.t' ] }
So we’ve created a new commit (4f407b39
) that is the parent of a different commit (da94af3a
) and a new tree (7ce38101
) that contains our old README
blob (1b9f426a
) and our new, previously orphaned, blob (19d9cc85
).
The Tale of Powerful Software ¶
I’ve always enjoyed the idea that software (and computer science more generally) is nothing but an abstraction to manage complexity. Good software— powerful software—like Git—is a software that manages an incredible amount of complexity and hides it completely from the user.
In recognition of this idea, I’ll leave you with the graph of my local copy of clippy—a small command line tool I created that is like man(1)
except it shows Clippy at the end of the man
output (yes, it’s dumb).
This should give you an idea of the complexity that is abstracted by the Git merkle graph: this repo contains 5 commits!