| 1 |
62 |
marcus.erl |
Kernel level exception handling in Linux 2.1.8
|
| 2 |
|
|
Commentary by Joerg Pommnitz
|
| 3 |
|
|
|
| 4 |
|
|
When a process runs in kernel mode, it often has to access user
|
| 5 |
|
|
mode memory whose address has been passed by an untrusted program.
|
| 6 |
|
|
To protect itself the kernel has to verify this address.
|
| 7 |
|
|
|
| 8 |
|
|
In older versions of Linux this was done with the
|
| 9 |
|
|
int verify_area(int type, const void * addr, unsigned long size)
|
| 10 |
|
|
function (which has since been replaced by access_ok()).
|
| 11 |
|
|
|
| 12 |
|
|
This function verified that the memory area starting at address
|
| 13 |
|
|
'addr' and of size 'size' was accessible for the operation specified
|
| 14 |
|
|
in type (read or write). To do this, verify_area had to look up the
|
| 15 |
|
|
virtual memory area (vma) that contained the address addr. In the
|
| 16 |
|
|
normal case (correctly working program), this test was successful.
|
| 17 |
|
|
It only failed for a few buggy programs. In some kernel profiling
|
| 18 |
|
|
tests, this normally unneeded verification used up a considerable
|
| 19 |
|
|
amount of time.
|
| 20 |
|
|
|
| 21 |
|
|
To overcome this situation, Linus decided to let the virtual memory
|
| 22 |
|
|
hardware present in every Linux-capable CPU handle this test.
|
| 23 |
|
|
|
| 24 |
|
|
How does this work?
|
| 25 |
|
|
|
| 26 |
|
|
Whenever the kernel tries to access an address that is currently not
|
| 27 |
|
|
accessible, the CPU generates a page fault exception and calls the
|
| 28 |
|
|
page fault handler
|
| 29 |
|
|
|
| 30 |
|
|
void do_page_fault(struct pt_regs *regs, unsigned long error_code)
|
| 31 |
|
|
|
| 32 |
|
|
in arch/i386/mm/fault.c. The parameters on the stack are set up by
|
| 33 |
|
|
the low level assembly glue in arch/i386/kernel/entry.S. The parameter
|
| 34 |
|
|
regs is a pointer to the saved registers on the stack, error_code
|
| 35 |
|
|
contains a reason code for the exception.
|
| 36 |
|
|
|
| 37 |
|
|
do_page_fault first obtains the inaccessible address from the CPU
|
| 38 |
|
|
control register CR2. If the address is within the virtual address
|
| 39 |
|
|
space of the process, the fault probably occurred because the page
|
| 40 |
|
|
was not swapped in, write protected or something similar. However,
|
| 41 |
|
|
we are interested in the other case: the address is not valid, there
|
| 42 |
|
|
is no vma that contains this address. In this case, the kernel jumps
|
| 43 |
|
|
to the bad_area label.
|
| 44 |
|
|
|
| 45 |
|
|
There it uses the address of the instruction that caused the exception
|
| 46 |
|
|
(i.e. regs->eip) to find an address where the execution can continue
|
| 47 |
|
|
(fixup). If this search is successful, the fault handler modifies the
|
| 48 |
|
|
return address (again regs->eip) and returns. The execution will
|
| 49 |
|
|
continue at the address in fixup.
|
| 50 |
|
|
|
| 51 |
|
|
Where does fixup point to?
|
| 52 |
|
|
|
| 53 |
|
|
Since we jump to the contents of fixup, fixup obviously points
|
| 54 |
|
|
to executable code. This code is hidden inside the user access macros.
|
| 55 |
|
|
I have picked the get_user macro defined in include/asm/uaccess.h as an
|
| 56 |
|
|
example. The definition is somewhat hard to follow, so let's peek at
|
| 57 |
|
|
the code generated by the preprocessor and the compiler. I selected
|
| 58 |
|
|
the get_user call in drivers/char/console.c for a detailed examination.
|
| 59 |
|
|
|
| 60 |
|
|
The original code in console.c line 1405:
|
| 61 |
|
|
get_user(c, buf);
|
| 62 |
|
|
|
| 63 |
|
|
The preprocessor output (edited to become somewhat readable):
|
| 64 |
|
|
|
| 65 |
|
|
(
|
| 66 |
|
|
{
|
| 67 |
|
|
long __gu_err = - 14 , __gu_val = 0;
|
| 68 |
|
|
const __typeof__(*( ( buf ) )) *__gu_addr = ((buf));
|
| 69 |
|
|
if (((((0 + current_set[0])->tss.segment) == 0x18 ) ||
|
| 70 |
|
|
(((sizeof(*(buf))) <= 0xC0000000UL) &&
|
| 71 |
|
|
((unsigned long)(__gu_addr ) <= 0xC0000000UL - (sizeof(*(buf)))))))
|
| 72 |
|
|
do {
|
| 73 |
|
|
__gu_err = 0;
|
| 74 |
|
|
switch ((sizeof(*(buf)))) {
|
| 75 |
|
|
case 1:
|
| 76 |
|
|
__asm__ __volatile__(
|
| 77 |
|
|
"1: mov" "b" " %2,%" "b" "1\n"
|
| 78 |
|
|
"2:\n"
|
| 79 |
|
|
".section .fixup,\"ax\"\n"
|
| 80 |
|
|
"3: movl %3,%0\n"
|
| 81 |
|
|
" xor" "b" " %" "b" "1,%" "b" "1\n"
|
| 82 |
|
|
" jmp 2b\n"
|
| 83 |
|
|
".section __ex_table,\"a\"\n"
|
| 84 |
|
|
" .align 4\n"
|
| 85 |
|
|
" .long 1b,3b\n"
|
| 86 |
|
|
".text" : "=r"(__gu_err), "=q" (__gu_val): "m"((*(struct __large_struct *)
|
| 87 |
|
|
( __gu_addr )) ), "i"(- 14 ), "0"( __gu_err )) ;
|
| 88 |
|
|
break;
|
| 89 |
|
|
case 2:
|
| 90 |
|
|
__asm__ __volatile__(
|
| 91 |
|
|
"1: mov" "w" " %2,%" "w" "1\n"
|
| 92 |
|
|
"2:\n"
|
| 93 |
|
|
".section .fixup,\"ax\"\n"
|
| 94 |
|
|
"3: movl %3,%0\n"
|
| 95 |
|
|
" xor" "w" " %" "w" "1,%" "w" "1\n"
|
| 96 |
|
|
" jmp 2b\n"
|
| 97 |
|
|
".section __ex_table,\"a\"\n"
|
| 98 |
|
|
" .align 4\n"
|
| 99 |
|
|
" .long 1b,3b\n"
|
| 100 |
|
|
".text" : "=r"(__gu_err), "=r" (__gu_val) : "m"((*(struct __large_struct *)
|
| 101 |
|
|
( __gu_addr )) ), "i"(- 14 ), "0"( __gu_err ));
|
| 102 |
|
|
break;
|
| 103 |
|
|
case 4:
|
| 104 |
|
|
__asm__ __volatile__(
|
| 105 |
|
|
"1: mov" "l" " %2,%" "" "1\n"
|
| 106 |
|
|
"2:\n"
|
| 107 |
|
|
".section .fixup,\"ax\"\n"
|
| 108 |
|
|
"3: movl %3,%0\n"
|
| 109 |
|
|
" xor" "l" " %" "" "1,%" "" "1\n"
|
| 110 |
|
|
" jmp 2b\n"
|
| 111 |
|
|
".section __ex_table,\"a\"\n"
|
| 112 |
|
|
" .align 4\n" " .long 1b,3b\n"
|
| 113 |
|
|
".text" : "=r"(__gu_err), "=r" (__gu_val) : "m"((*(struct __large_struct *)
|
| 114 |
|
|
( __gu_addr )) ), "i"(- 14 ), "0"(__gu_err));
|
| 115 |
|
|
break;
|
| 116 |
|
|
default:
|
| 117 |
|
|
(__gu_val) = __get_user_bad();
|
| 118 |
|
|
}
|
| 119 |
|
|
} while (0) ;
|
| 120 |
|
|
((c)) = (__typeof__(*((buf))))__gu_val;
|
| 121 |
|
|
__gu_err;
|
| 122 |
|
|
}
|
| 123 |
|
|
);
|
| 124 |
|
|
|
| 125 |
|
|
WOW! Black GCC/assembly magic. This is impossible to follow, so let's
|
| 126 |
|
|
see what code gcc generates:
|
| 127 |
|
|
|
| 128 |
|
|
> xorl %edx,%edx
|
| 129 |
|
|
> movl current_set,%eax
|
| 130 |
|
|
> cmpl $24,788(%eax)
|
| 131 |
|
|
> je .L1424
|
| 132 |
|
|
> cmpl $-1073741825,64(%esp)
|
| 133 |
|
|
> ja .L1423
|
| 134 |
|
|
> .L1424:
|
| 135 |
|
|
> movl %edx,%eax
|
| 136 |
|
|
> movl 64(%esp),%ebx
|
| 137 |
|
|
> #APP
|
| 138 |
|
|
> 1: movb (%ebx),%dl /* this is the actual user access */
|
| 139 |
|
|
> 2:
|
| 140 |
|
|
> .section .fixup,"ax"
|
| 141 |
|
|
> 3: movl $-14,%eax
|
| 142 |
|
|
> xorb %dl,%dl
|
| 143 |
|
|
> jmp 2b
|
| 144 |
|
|
> .section __ex_table,"a"
|
| 145 |
|
|
> .align 4
|
| 146 |
|
|
> .long 1b,3b
|
| 147 |
|
|
> .text
|
| 148 |
|
|
> #NO_APP
|
| 149 |
|
|
> .L1423:
|
| 150 |
|
|
> movzbl %dl,%esi
|
| 151 |
|
|
|
| 152 |
|
|
The optimizer does a good job and gives us something we can actually
|
| 153 |
|
|
understand. Can we? The actual user access is quite obvious. Thanks
|
| 154 |
|
|
to the unified address space we can just access the address in user
|
| 155 |
|
|
memory. But what does the .section stuff do?????
|
| 156 |
|
|
|
| 157 |
|
|
To understand this we have to look at the final kernel:
|
| 158 |
|
|
|
| 159 |
|
|
> objdump --section-headers vmlinux
|
| 160 |
|
|
>
|
| 161 |
|
|
> vmlinux: file format elf32-i386
|
| 162 |
|
|
>
|
| 163 |
|
|
> Sections:
|
| 164 |
|
|
> Idx Name Size VMA LMA File off Algn
|
| 165 |
|
|
> 0 .text 00098f40 c0100000 c0100000 00001000 2**4
|
| 166 |
|
|
> CONTENTS, ALLOC, LOAD, READONLY, CODE
|
| 167 |
|
|
> 1 .fixup 000016bc c0198f40 c0198f40 00099f40 2**0
|
| 168 |
|
|
> CONTENTS, ALLOC, LOAD, READONLY, CODE
|
| 169 |
|
|
> 2 .rodata 0000f127 c019a5fc c019a5fc 0009b5fc 2**2
|
| 170 |
|
|
> CONTENTS, ALLOC, LOAD, READONLY, DATA
|
| 171 |
|
|
> 3 __ex_table 000015c0 c01a9724 c01a9724 000aa724 2**2
|
| 172 |
|
|
> CONTENTS, ALLOC, LOAD, READONLY, DATA
|
| 173 |
|
|
> 4 .data 0000ea58 c01abcf0 c01abcf0 000abcf0 2**4
|
| 174 |
|
|
> CONTENTS, ALLOC, LOAD, DATA
|
| 175 |
|
|
> 5 .bss 00018e21 c01ba748 c01ba748 000ba748 2**2
|
| 176 |
|
|
> ALLOC
|
| 177 |
|
|
> 6 .comment 00000ec4 00000000 00000000 000ba748 2**0
|
| 178 |
|
|
> CONTENTS, READONLY
|
| 179 |
|
|
> 7 .note 00001068 00000ec4 00000ec4 000bb60c 2**0
|
| 180 |
|
|
> CONTENTS, READONLY
|
| 181 |
|
|
|
| 182 |
|
|
There are obviously 2 non-standard ELF sections in the generated object
|
| 183 |
|
|
file. But first we want to find out what happened to our code in the
|
| 184 |
|
|
final kernel executable:
|
| 185 |
|
|
|
| 186 |
|
|
> objdump --disassemble --section=.text vmlinux
|
| 187 |
|
|
>
|
| 188 |
|
|
> c017e785 xorl %edx,%edx
|
| 189 |
|
|
> c017e787 movl 0xc01c7bec,%eax
|
| 190 |
|
|
> c017e78c cmpl $0x18,0x314(%eax)
|
| 191 |
|
|
> c017e793 je c017e79f
|
| 192 |
|
|
> c017e795 cmpl $0xbfffffff,0x40(%esp,1)
|
| 193 |
|
|
> c017e79d ja c017e7a7
|
| 194 |
|
|
> c017e79f movl %edx,%eax
|
| 195 |
|
|
> c017e7a1 movl 0x40(%esp,1),%ebx
|
| 196 |
|
|
> c017e7a5 movb (%ebx),%dl
|
| 197 |
|
|
> c017e7a7 movzbl %dl,%esi
|
| 198 |
|
|
|
| 199 |
|
|
The whole user memory access is reduced to 10 x86 machine instructions.
|
| 200 |
|
|
The instructions bracketed in the .section directives are no longer
|
| 201 |
|
|
in the normal execution path. They are located in a different section
|
| 202 |
|
|
of the executable file:
|
| 203 |
|
|
|
| 204 |
|
|
> objdump --disassemble --section=.fixup vmlinux
|
| 205 |
|
|
>
|
| 206 |
|
|
> c0199ff5 <.fixup+10b5> movl $0xfffffff2,%eax
|
| 207 |
|
|
> c0199ffa <.fixup+10ba> xorb %dl,%dl
|
| 208 |
|
|
> c0199ffc <.fixup+10bc> jmp c017e7a7
|
| 209 |
|
|
|
| 210 |
|
|
And finally:
|
| 211 |
|
|
> objdump --full-contents --section=__ex_table vmlinux
|
| 212 |
|
|
>
|
| 213 |
|
|
> c01aa7c4 93c017c0 e09f19c0 97c017c0 99c017c0 ................
|
| 214 |
|
|
> c01aa7d4 f6c217c0 e99f19c0 a5e717c0 f59f19c0 ................
|
| 215 |
|
|
> c01aa7e4 080a18c0 01a019c0 0a0a18c0 04a019c0 ................
|
| 216 |
|
|
|
| 217 |
|
|
or in human readable byte order:
|
| 218 |
|
|
|
| 219 |
|
|
> c01aa7c4 c017c093 c0199fe0 c017c097 c017c099 ................
|
| 220 |
|
|
> c01aa7d4 c017c2f6 c0199fe9 c017e7a5 c0199ff5 ................
|
| 221 |
|
|
^^^^^^^^^^^^^^^^^
|
| 222 |
|
|
this is the interesting part!
|
| 223 |
|
|
> c01aa7e4 c0180a08 c019a001 c0180a0a c019a004 ................
|
| 224 |
|
|
|
| 225 |
|
|
What happened? The assembly directives
|
| 226 |
|
|
|
| 227 |
|
|
.section .fixup,"ax"
|
| 228 |
|
|
.section __ex_table,"a"
|
| 229 |
|
|
|
| 230 |
|
|
told the assembler to move the following code to the specified
|
| 231 |
|
|
sections in the ELF object file. So the instructions
|
| 232 |
|
|
3: movl $-14,%eax
|
| 233 |
|
|
xorb %dl,%dl
|
| 234 |
|
|
jmp 2b
|
| 235 |
|
|
ended up in the .fixup section of the object file and the addresses
|
| 236 |
|
|
.long 1b,3b
|
| 237 |
|
|
ended up in the __ex_table section of the object file. 1b and 3b
|
| 238 |
|
|
are local labels. The local label 1b (1b stands for next label 1
|
| 239 |
|
|
backward) is the address of the instruction that might fault, i.e.
|
| 240 |
|
|
in our case the address of the label 1 is c017e7a5:
|
| 241 |
|
|
the original assembly code: > 1: movb (%ebx),%dl
|
| 242 |
|
|
and linked in vmlinux : > c017e7a5 movb (%ebx),%dl
|
| 243 |
|
|
|
| 244 |
|
|
The local label 3 (backwards again) is the address of the code to handle
|
| 245 |
|
|
the fault, in our case the actual value is c0199ff5:
|
| 246 |
|
|
the original assembly code: > 3: movl $-14,%eax
|
| 247 |
|
|
and linked in vmlinux : > c0199ff5 <.fixup+10b5> movl $0xfffffff2,%eax
|
| 248 |
|
|
|
| 249 |
|
|
The assembly code
|
| 250 |
|
|
> .section __ex_table,"a"
|
| 251 |
|
|
> .align 4
|
| 252 |
|
|
> .long 1b,3b
|
| 253 |
|
|
|
| 254 |
|
|
becomes the value pair
|
| 255 |
|
|
> c01aa7d4 c017c2f6 c0199fe9 c017e7a5 c0199ff5 ................
|
| 256 |
|
|
^this is ^this is
|
| 257 |
|
|
1b 3b
|
| 258 |
|
|
c017e7a5,c0199ff5 in the exception table of the kernel.
|
| 259 |
|
|
|
| 260 |
|
|
So, what actually happens if a fault from kernel mode with no suitable
|
| 261 |
|
|
vma occurs?
|
| 262 |
|
|
|
| 263 |
|
|
1.) access to invalid address:
|
| 264 |
|
|
> c017e7a5 movb (%ebx),%dl
|
| 265 |
|
|
2.) MMU generates exception
|
| 266 |
|
|
3.) CPU calls do_page_fault
|
| 267 |
|
|
4.) do_page_fault calls search_exception_table (regs->eip == c017e7a5);
|
| 268 |
|
|
5.) search_exception_table looks up the address c017e7a5 in the
|
| 269 |
|
|
exception table (i.e. the contents of the ELF section __ex_table)
|
| 270 |
|
|
and returns the address of the associated fault handling code c0199ff5.
|
| 271 |
|
|
6.) do_page_fault modifies its own return address to point to the fault
|
| 272 |
|
|
handling code and returns.
|
| 273 |
|
|
7.) execution continues in the fault handling code.
|
| 274 |
|
|
8.) 8a) EAX becomes -EFAULT (== -14)
|
| 275 |
|
|
8b) DL becomes zero (the value we "read" from user space)
|
| 276 |
|
|
8c) execution continues at local label 2 (address of the
|
| 277 |
|
|
instruction immediately after the faulting user access).
|
| 278 |
|
|
|
| 279 |
|
|
The steps 8a to 8c in a certain way emulate the faulting instruction.
|
| 280 |
|
|
|
| 281 |
|
|
That's it, mostly. If you look at our example, you might ask why
|
| 282 |
|
|
we set EAX to -EFAULT in the exception handler code. Well, the
|
| 283 |
|
|
get_user macro actually returns a value: 0, if the user access was
|
| 284 |
|
|
successful, -EFAULT on failure. Our original code did not test this
|
| 285 |
|
|
return value, however the inline assembly code in get_user tries to
|
| 286 |
|
|
return -EFAULT. GCC selected EAX to return this value.
|
| 287 |
|
|
|
| 288 |
|
|
NOTE:
|
| 289 |
|
|
Due to the way that the exception table is built and needs to be ordered,
|
| 290 |
|
|
only use exceptions for code in the .text section. Any other section
|
| 291 |
|
|
will cause the exception table to not be sorted correctly, and the
|
| 292 |
|
|
exceptions will fail.
|