Allocating before you know the size

Hi guys,

So I'm having a crack at writing a compiler to couple HTML/CSS/JS in a cleaner and nicer way.

My full-time job is writing HTML/CSS/PHP so the only C/C++ practice I get is from HH.

I'm using the simple_preprocessor.cpp from Handmade Hero as the base, and I'm just looking for some criticism on how I've implemented these functions in my code.

Would it be better to just allocate the 255 slots up-front and just use them? Or temporarily allocate 255 (or more) slots (as I'm doing now) and just copy it out? Or lex once to get the parameter count, then lex again, but this time store the data? It's a tough call to make just because I'm going to be doing -something- like this a lot during this project, so I want to do things in a sane way and avoid rewriting it in the future. I'd like to know the caveats of each method (or whether I'm just not seeing a much simpler solution).

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
internal AST_Parameters* parseParameters(Tokenizer* tokenizer) {
	// Parses a `name = value, ...)` parameter list into an AST_Parameters
	// node allocated from the current arena. Returns NULL on any lex error
	// or when the list is empty.
	TemporaryPoolScope tempPool; // NOTE(Jake): Same as TemporaryPool in HH but immediately uses a g_allocator variable and resets when it goes out of scope
	// NOTE(Jake): 255 is an assumed upper bound on parameter count; the
	// temporary arrays are copied into right-sized arena storage below.
	Array<Token> parameters(255, tempPool);
	Array<Token> values(255, tempPool);
	
	for (s32 parameterCount = 0;;++parameterCount)
	{
		Token left = getToken(tokenizer);
		if (left.type == TOKEN_PAREN_CLOSE)
		{
			break;
		}
		if (!requireToken(left, TOKEN_IDENTIFIER))
		{
			return NULL;
		}
		if (!requireToken(tokenizer, TOKEN_EQUAL))
		{
			return NULL;
		}

		Token right = getToken(tokenizer);
		if (!(right.type == TOKEN_IDENTIFIER 
			|| right.type == TOKEN_STRING))
		{
			lexError(right, "Parameter %d is invalid. Expected identifier or string, instead got '%*.*s'", parameterCount, right.length, right.length, right.data);
			return NULL;
		}

		// parameters/values are pushed in lockstep, so used counts match.
		parameters.push(left);
		values.push(right);
	}

	// An empty parameter list produces no AST node.
	if (parameters.used == 0)
	{
		return NULL;
	}

	AST_Parameters* ast = pushStructCurrent(AST_Parameters);
	ast->type = AST_PARAMETERS;
	ast->values = pushArrayStructCurrent(Token, values.used);
	ast->parameters = pushArrayStructCurrent(Token, parameters.used);
	for(s32 i = 0; i < values.used; ++i)
	{
		ast->values[i] = values.data[i];
		// FIX: was `values.data[i]`, which copied the value token into the
		// parameter array and silently discarded every parameter name.
		ast->parameters[i] = parameters.data[i];
	}
	return ast;
}


For this function again, not sure what the best approach is. I'm guessing FindFirstFile() is slow each call (hitting the hard drive) so I probably shouldn't scan the directories twice right? So I don't know what an ideal way is to estimate the allocation size for these strings.

To give some context, this will probably only need to be called once at the start of the app (you provide a directory, the compiler will handle the rest from there)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
StringLinkedList getFilesRecursive(String directory, Error* error)
	{
		// Walks `directory` breadth-first (explicit stack of pending
		// directories) and returns every file found as a linked list of
		// UTF-8 paths. On failure, sets error->errorCode (if error is
		// non-NULL) and returns an empty list.
		wchar_t szBaseDir[MAX_PATH];
		mbstowcs(szBaseDir, directory.data, MAX_PATH);

		const u32 directoryLimit = 100;
		wchar_t szDirectories[directoryLimit][MAX_PATH+10];
		u32 directories = 0;
		u32 files = 0;
		StringCchCopy(szDirectories[directories++], MAX_PATH, szBaseDir);

		StringLinkedList list = {};
		while (directories > 0)
		{
			--directories;

			wchar_t currentDirectory[MAX_PATH];
			StringCchCopy(currentDirectory, MAX_PATH, szDirectories[directories]);

			wchar_t currentDirectoryWSearch[MAX_PATH];
			StringCchCopy(currentDirectoryWSearch, MAX_PATH, currentDirectory);
			StringCchCat(currentDirectoryWSearch, MAX_PATH, TEXT("\\*"));

			WIN32_FIND_DATA ffd;
			HANDLE hFind = FindFirstFile(currentDirectoryWSearch, &ffd);
			if (hFind == INVALID_HANDLE_VALUE)
			{
				// FIX: GetLastError() is only meaningful after a failed call.
				// The old code also consulted it when FindFirstFile succeeded,
				// so a stale ERROR_FILE_NOT_FOUND could bail out early and
				// leak a valid hFind.
				DWORD dLastError = GetLastError();
				if (error)
				{
					if (dLastError == ERROR_FILE_NOT_FOUND)
					{
						error->errorCode = DIRECTORY_NO_FILES;
					}
					else
					{
						error->errorCode = DIRECTORY_INVALID_DIR;
					}
				}
				StringLinkedList list_null = {};
				return list_null;
			}

			do
			{
				// Skip the "." and ".." pseudo-entries.
				if (wcscmp(ffd.cFileName, TEXT(".")) != 0 && wcscmp(ffd.cFileName, TEXT("..")) != 0)
				{
					if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)
					{
						// FIX: bound-check the fixed-size pending stack; the
						// old code wrote past szDirectories once nesting
						// exceeded directoryLimit entries.
						if (directories < directoryLimit)
						{
							// Push this directory to be searched next iteration.
							StringCchCopy(szDirectories[directories], MAX_PATH, currentDirectory);
							StringCchCat(szDirectories[directories], MAX_PATH, TEXT("\\"));
							StringCchCat(szDirectories[directories], MAX_PATH, ffd.cFileName);
							++directories;
						}
						// TODO(Jake): report directories skipped because the
						// pending stack was full.
					}
					else
					{
						wchar_t filepath[MAX_PATH];
						// todo(Jake): Check if .fel file
						StringCchCopy(filepath, ArrayCount(filepath), currentDirectory);
						StringCchCat(filepath, ArrayCount(filepath), TEXT("\\"));
						StringCchCat(filepath, ArrayCount(filepath), ffd.cFileName);
						
						String string = WcharToUTF8String(filepath);
						list.add(string);
						++files;
					}
				}
			}
			while (FindNextFile(hFind, &ffd) != 0);
			FindClose(hFind);
		}
		return list;
	} // FIX: the posted snippet was missing the function's closing brace.

inline String WcharToUTF8String(LPCWSTR string) {
	// Converts a wide (UTF-16) string to a UTF-8 String whose storage is
	// allocated from the current arena. result.length excludes the
	// terminator; result.data is NUL-terminated for C-string consumers.
	String result = {};
	u32 stringLength = (u32)wcslen(string);
	// First call with a NULL buffer asks for the required byte count.
	result.length = WideCharToMultiByte(CP_UTF8, 0, string, stringLength, NULL, 0, NULL, NULL);
	result.data = (char*)pushSizeCurrent(result.length + 1);
	WideCharToMultiByte(CP_UTF8, 0, string, stringLength, result.data, result.length, NULL, NULL);
	// FIX: the +1 byte was allocated for a terminator but never written,
	// so anything treating result.data as a C string read past the end.
	result.data[result.length] = '\0';
	return result;
}


If I'm missing any info that you guys need/want for further criticism, I'll provide. I just didn't want this post becoming too dense.
If you're writing a thing that runs once and then closes, memory usage just really isn't that much of a concern. Really. Until you're allocating gigabytes at a time, it's just not that important. (Even then, allocating memory pages you don't actually touch is... not free, but pretty cheap.)

In general, though, you have two options:

a) Pick a reasonable number. What's the common case? If you want to do it 'right', don't just guess, gather some stats. (This can be as simple as printing the count to the console on every call -- throw the result in a spreadsheet and see what it tells you.) If you exceed that size in practice, just reallocate a larger buffer.

b) Define a maximum number, allocate that much, assert if you exceed that number. If there's a clear number to use for this (defined in a specification or something), it's the simpler option.

If you're more familiar with scripting languages, it's easy to forget how *ridiculously* fast computers are. A "slow" operation only matters in context.
Are those function arguments you are parsing? Just hardcoding max arguments for function with some reasonable number is perfectly fine. I would actually choose something smaller, like 32 instead of 256. And allocating 32 entries up front is perfectly reasonable thing. Just assert when you reach max count, don't try to process stupid code (more than X arguments).

As for file handling - when scanning twice you shouldn't be worried about performance. Instead you should be worried about correctness. What if new files get added between your scans? Now the second loop will need to process more items than expected. And what if a file gets deleted? Then there will be fewer items (which is not as big a deal as the first case).

I solve this by using linked lists. Instead of putting items in one big array, I put in many smaller arrays linked together. The code looks like this:
1
2
3
4
5
6
// One link in a chain of fixed-capacity item arrays ("linked buckets"):
// lets a collector grow without knowing the total count up front.
struct Bucket
{
    Item* items;   // storage for this bucket's items (fixed capacity, allocated by the collector)
    u32 count;     // how many slots of `items` are currently in use
    Bucket* next;  // next bucket in the chain, or 0 for the last bucket
};


Then the code that collects files looks something like this:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
// Illustrative pseudocode: gathers an unknown number of items into a chain
// of fixed-capacity buckets, allocating one bucket's worth at a time.
Bucket* Collect()
{
    Bucket* result = Allocate(sizeof(Bucket));
    const u32 MaxItemsPerBucket = 1024;

    Bucket* current = result;
    current->items = Allocate(MaxItemsPerBucket * sizeof(Item)); // FIX: semicolon was missing
    current->count = 0;
    current->next = 0;
    while (...more files...)
    {
        if (current->count == MaxItemsPerBucket)
        {
            // Current bucket is full - chain a fresh one and keep filling.
            Bucket* next = Allocate(sizeof(Bucket));
            next->items = Allocate(MaxItemsPerBucket * sizeof(Item)); // FIX: semicolon was missing
            next->count = 0;
            next->next = 0;
            current->next = next;
            current = next;
        }
        // FIX: take the address of the slot - plain indexing yields an Item
        // value, not the Item* that the member assignments below need.
        Item* item = &current->items[current->count++];
        item->name = ...;
        item->whatever = ...;
    }
    return result;
}


Looping over such a linked array is pretty trivial:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
ProcessItems(Bucket* bucket)
{
    while (bucket)
    {
        for (u32 i=0; i<bucket->count; i++)
        {
            Item* item = bucket->items + i;
            ... do something with item ...
        }
        bucket = bucket->next;
    }
}


This way it doesn't matter if you don't know ahead of time how many items are in folder. It will allocate 1024 items at a time, which is not a big deal. You can even control the size per bucket dynamically (for example, for first bucket allocate only 32 elements, for second 64, for next 128, and so on.. up to 4096, and then keep it max 4096 per bucket).

And if you allocate memory from an arena like it is done in Handmade Hero, then you can see how easy it is to deallocate this structure. Just do BeginTemporaryMemory before and EndTemporaryMemory after.

If you want all items in one contiguous array, then it is pretty trivial to count how many items are in total, allocate one contiguous memory for new array and then memcpy from buckets to new array.

Edited by Mārtiņš Možeiko on
Thanks a lot for your advice guys, I've opted to only parse 32 function parameters for now like so:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#define MAX_PARAMETER_COUNT 32
internal AST_Parameters* parseParameters(Tokenizer* tokenizer) {
	// Parses a `name = value, ...)` parameter list. Storage is allocated
	// lazily on the first parameter so empty lists allocate nothing.
	// Returns NULL on any lex error or when the list is empty.
	Array<Token>* parameters = NULL;
	Array<Token>* values = NULL;
	
	for (s32 parameterCount = 0;;++parameterCount)
	{
		Token left = getToken(tokenizer);
		if (left.type == TOKEN_PAREN_CLOSE)
		{
			break;
		}
		else if (left.type == TOKEN_COMMA)
		{
			// FIX: a comma is only a valid separator AFTER a parameter.
			// The old code consumed it unconditionally, so a leading comma
			// like `(,a=1)` was silently accepted.
			if (parameterCount == 0)
			{
				lexError(left, "Unexpected ',' before first parameter.");
				return NULL;
			}
			// Get identifier
			left = getToken(tokenizer);
		}
		if (!requireToken(left, TOKEN_IDENTIFIER))
		{
			return NULL;
		}
		if (!requireToken(tokenizer, TOKEN_EQUAL))
		{
			return NULL;
		}

		Token right = getToken(tokenizer);
		if (!(right.type == TOKEN_IDENTIFIER 
			|| right.type == TOKEN_STRING))
		{
			lexError(right, "Parameter %d is invalid. Expected identifier or string, instead got '%*.*s'", parameterCount, right.length, right.length, right.data);
			return NULL;
		}

		// Lazily create both arrays together on the first parameter.
		if (parameters == NULL && values == NULL)
		{
			parameters = Array<Token>::create(MAX_PARAMETER_COUNT);
			values = Array<Token>::create(MAX_PARAMETER_COUNT);
		}

		// parameters/values grow in lockstep, so one capacity check covers both.
		if (parameters->used == parameters->size)
		{
			lexError(right, "Maximum allowed parameters for function or component is %d.", MAX_PARAMETER_COUNT);
			return NULL;
		}

		parameters->push(left);
		values->push(right);
	}

	// No parameters seen: the arrays were never created.
	if (parameters == NULL || values == NULL)
	{
		return NULL;
	}

	AST_Parameters* ast = pushStructCurrent(AST_Parameters);
	ast->type = AST_PARAMETERS;
	ast->values = values;
	ast->parameters = parameters;
	return ast;
}
#undef MAX_PARAMETER_COUNT


As for the file handling stuff, I'll probably rewrite that to be more robust in the future when this thing needs to be used in the real world, but the explanation of buckets was great, so thank you.