2025-12-06 18:57:09 +00:00
|
|
|
---
|
|
|
|
|
tags:
|
|
|
|
|
- C
|
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
# The C compilation process
|
|
|
|
|
|
|
|
|
|
C code is compiled to a binary executable in four stages:
|
|
|
|
|
|
|
|
|
|
1. Preprocessing
|
|
|
|
|
2. Compilation
|
|
|
|
|
3. Assembly
|
|
|
|
|
4. Linking
|
|
|
|
|
|
|
|
|
|
To demonstrate the output at the different stages I will compile the following
|
|
|
|
|
simple program:
|
|
|
|
|
|
|
|
|
|
```c
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
|
|
|
|
|
int main (void)
|
|
|
|
|
{
|
|
|
|
|
printf("Hello world");
|
|
|
|
|
}
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
For standard compilation when you don't need to see all the interim stages, you
|
|
|
|
|
just run the following in your source directory:
|
|
|
|
|
|
|
|
|
|
```sh
|
|
|
|
|
gcc main.c
|
|
|
|
|
```
|
|
|
|
|
|
2025-12-06 19:30:18 +00:00
|
|
|
> `gcc` originally stood for GNU C Compiler (now GNU Compiler Collection). In practice `gcc` is a driver that orchestrates
|
|
|
|
|
> multiple tools: `cc1`, the actual C-to-assembly compiler; `as` the assembler
|
|
|
|
|
> (assembly to object file); and `ld` the linker (linking the object files to
|
|
|
|
|
> the executable)
|
|
|
|
|
|
2025-12-06 18:57:09 +00:00
|
|
|
This generates:
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
a.out main.c
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
`a.out` is the executable binary.
|
|
|
|
|
|
|
|
|
|
To run this code:
|
|
|
|
|
|
|
|
|
|
```sh
|
|
|
|
|
./a.out
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
To compile to a specified file name:
|
|
|
|
|
|
|
|
|
|
```sh
|
|
|
|
|
gcc -o hello_world main.c
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
Then to run:
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
./hello_world
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
## Preprocessing
|
|
|
|
|
|
|
|
|
|
The preprocessor finds all directives starting with `#` such as header file
|
|
|
|
|
`include` statements and adds them to your source code.
|
|
|
|
|
|
|
|
|
|
View this with:
|
|
|
|
|
|
|
|
|
|
```sh
|
|
|
|
|
gcc -E main.c
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
Here is an example for my script:
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
extern char *ctermid (char *__s) __attribute__ ((__nothrow__ , __leaf__))
|
|
|
|
|
__attribute__ ((__access__ (__write_only__, 1)));
|
|
|
|
|
# 931 "/usr/include/stdio.h" 3 4
|
|
|
|
|
extern void flockfile (FILE *__stream) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1)));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extern int ftrylockfile (FILE *__stream) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1)));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extern void funlockfile (FILE *__stream) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1)));
|
|
|
|
|
# 949 "/usr/include/stdio.h" 3 4
|
|
|
|
|
extern int __uflow (FILE *);
|
|
|
|
|
extern int __overflow (FILE *, int);
|
|
|
|
|
# 973 "/usr/include/stdio.h" 3 4
|
|
|
|
|
|
|
|
|
|
# 2 "main.c" 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 3 "main.c"
|
|
|
|
|
int main(void) { printf("Hello world"); }
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
## Compilation
|
|
|
|
|
|
|
|
|
|
Takes the pre-processed source code and translates it into assembly language for
|
|
|
|
|
your target architecture.
|
|
|
|
|
|
|
|
|
|
At this stage the compiler checks your code for syntax errors and applies
|
|
|
|
|
optimisation.
|
|
|
|
|
|
|
|
|
|
The result is human-readable assembly in a file called `main.s`
|
|
|
|
|
|
|
|
|
|
Create this with:
|
|
|
|
|
|
|
|
|
|
```sh
|
|
|
|
|
gcc -S main.c
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
Here is the output for my script:
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
.file "main.c"
|
|
|
|
|
.text
|
|
|
|
|
.section .rodata
|
|
|
|
|
.LC0:
|
|
|
|
|
.string "Hello world"
|
|
|
|
|
.text
|
|
|
|
|
.globl main
|
|
|
|
|
.type main, @function
|
|
|
|
|
main:
|
|
|
|
|
.LFB0:
|
|
|
|
|
.cfi_startproc
|
|
|
|
|
pushq %rbp
|
|
|
|
|
.cfi_def_cfa_offset 16
|
|
|
|
|
.cfi_offset 6, -16
|
|
|
|
|
movq %rsp, %rbp
|
|
|
|
|
.cfi_def_cfa_register 6
|
|
|
|
|
leaq .LC0(%rip), %rax
|
|
|
|
|
movq %rax, %rdi
|
|
|
|
|
movl $0, %eax
|
|
|
|
|
call printf@PLT
|
|
|
|
|
movl $0, %eax
|
|
|
|
|
popq %rbp
|
|
|
|
|
.cfi_def_cfa 7, 8
|
|
|
|
|
ret
|
|
|
|
|
.cfi_endproc
|
|
|
|
|
.LFE0:
|
|
|
|
|
.size main, .-main
|
|
|
|
|
.ident "GCC: (GNU) 15.2.1 20250813"
|
|
|
|
|
.section .note.GNU-stack,"",@progbits
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
## Assembly
|
|
|
|
|
|
|
|
|
|
The assembly language code is converted into machine code. The output is an
|
2025-12-06 19:30:18 +00:00
|
|
|
**object file** (`.o`) which contains the machine code but is not yet executable
|
|
|
|
|
because it is not yet linked to the functions and variables that come from
|
|
|
|
|
imported code. Your object file is not yet combined with the object files of the
|
|
|
|
|
libraries and resources you have used.
|
2025-12-06 18:57:09 +00:00
|
|
|
|
|
|
|
|
Create just the object file with:
|
|
|
|
|
|
|
|
|
|
```sh
|
|
|
|
|
gcc -c main.c
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
As it is a binary file it is not human-readable. However you can use `objdump` to
|
|
|
|
|
view a more intelligible representation of the output.
|
|
|
|
|
|
|
|
|
|
```sh
|
|
|
|
|
objdump -dS main.o
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
main.o: file format elf64-x86-64
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Disassembly of section .text:
|
|
|
|
|
|
|
|
|
|
0000000000000000 <main>:
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
|
|
|
|
|
int main(void) { printf("Hello world"); }
|
|
|
|
|
0: 55 push %rbp
|
|
|
|
|
1: 48 89 e5 mov %rsp,%rbp
|
|
|
|
|
4: 48 8d 05 00 00 00 00 lea 0x0(%rip),%rax # b <main+0xb>
|
|
|
|
|
b: 48 89 c7 mov %rax,%rdi
|
|
|
|
|
e: b8 00 00 00 00 mov $0x0,%eax
|
|
|
|
|
13: e8 00 00 00 00 call 18 <main+0x18>
|
|
|
|
|
18: b8 00 00 00 00 mov $0x0,%eax
|
|
|
|
|
1d: 5d pop %rbp
|
|
|
|
|
1e: c3 ret
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
To break this down:
|
|
|
|
|
|
|
|
|
|
- Left: the offset (address within this object file)
|
|
|
|
|
- Middle: the actual bytes - this is what the CPU reads and executes
|
|
|
|
|
- Right: the human-readable assembly, which is just a translation of those bytes
|
|
|
|
|
|
|
|
|
|
> The assembly here is different to the assembly earlier in `main.s`. This is
|
|
|
|
|
> because this time it is being interpreted after it has been written to machine
|
|
|
|
|
> code. It's a translation back from machine code to assembly.
|
|
|
|
|
|
|
|
|
|
## Linking
|
|
|
|
|
|
|
|
|
|
In the final stage the object files are combined, resolving all the references
|
|
|
|
|
between them. The result of this stage will be the `a.out` file mentioned
|
|
|
|
|
earlier.
|
2025-12-06 19:30:18 +00:00
|
|
|
|
|
|
|
|
## Different architectures
|
|
|
|
|
|
|
|
|
|
By default `gcc` will compile to whichever architecture it is being run on.
|
|
|
|
|
|
|
|
|
|
As I am using Linux x86-64, I get x86 machine code.
|
|
|
|
|
|
|
|
|
|
It is possible - although not trivial - to compile to different architectures
|
|
|
|
|
which use different [instruction sets](./Instruction_set_architectures.md),
|
|
|
|
|
that is, instruction sets other than that of the native machine you are running `gcc` on. This
|
|
|
|
|
is known as **cross-compiling**.
|
|
|
|
|
|
|
|
|
|
You might do this for ARM, say, and it would result in an ARM64 object file.
|