选择了Challenge 2:文件缓存的驱逐机制。
对MIT代码的修改
JOS Lab最后更新于2018年。5年来,GCC有了诸多更新,原有的JOS代码在新的GCC上会报错。在报告的这一部分,我将描述对原有MIT代码的修改,以使其通过现在GCC的编译检查。
修复变量重定义错误
在原有的代码中,struct Super *super和uint32_t *bitmap定义在fs/fs.h。然而fs/fs.h被多个源文件引用,这些源文件会分别编译,并链接到一起。这样,super和bitmap在链接时就会有多重定义。
我的解决方案是:把super和bitmap的定义从fs/fs.h移至fs/bc.c。而在fs/fs.c和fs/test.c,对super和bitmap的声明加extern。
绕过GCC的对齐检查
在lib/spawn.c的spawn()函数中,有这样一句:
if ((r = init_stack(child, argv, &child_tf.tf_esp)) < 0)
return r;然而,现在的GCC认为,对child_tf.tf.esp取地址会无法满足对齐要求(因为struct Trapframe以__attribute__((packed))声明)。
注意到JOS提供了offsetof宏。可以借此绕过对齐检查:
if ((r = init_stack(child, argv,
(uintptr_t *)(((char *)&child_tf)
+ offsetof(struct Trapframe, tf_esp)))) < 0)
return r;(题外话:一开始我没有写(char *),导致大量Page fault……所以指针加法一定要注意类型啊。)
Exercise 1
在env_create()中加入:
// If this is the file server (type == ENV_TYPE_FS) give it I/O privileges.
// LAB 5: Your code here.
new_env->env_tf.tf_eflags &= ~FL_IOPL_MASK;
if (type == ENV_TYPE_FS)
new_env->env_tf.tf_eflags |= FL_IOPL_3;
else
new_env->env_tf.tf_eflags |= FL_IOPL_0;当然,这样的代码只是为了可读性。实际上完全可以写成:
if (type == ENV_TYPE_FS)
new_env->env_tf.tf_eflags |= FL_IOPL_3;Question 1
不需要做任何事。因为上下文切换时会自动保存和恢复EFLAGS。
Exercise 2
在bc_pgfault()中加入:
// LAB 5: you code here:
addr = ROUNDDOWN(addr, PGSIZE);
r = sys_page_alloc(0, addr, PTE_P | PTE_U | PTE_W);
if (r < 0)
panic("`%s' error: %e.", __func__, r);
r = ide_read(blockno * BLKSECTS, addr, BLKSECTS);
if (r < 0)
panic("`%s' error: %e.", __func__, r);在flush_block()中加入:
// LAB 5: Your code here.
addr = ROUNDDOWN(addr, BLKSIZE);
if (!va_is_mapped(addr) || !va_is_dirty(addr))
return;
int ret = ide_write(blockno * BLKSECTS, addr, BLKSECTS);
if (ret < 0)
panic("`%s' error: %e.", __func__, ret);
ret = sys_page_map(0, addr, 0, addr, uvpt[PGNUM(addr)] & PTE_SYSCALL);
if (ret < 0)
panic("`%s' error: %e.", __func__, ret);Exercise 3
简单地线性搜索空闲块即可。
int alloc_block(void)
{
// The bitmap consists of one or more blocks. A single bitmap block
// contains the in-use bits for BLKBITSIZE blocks. There are
// super->s_nblocks blocks in the disk altogether.
// LAB 5: Your code here.
if (!super)
panic("no super block");
const uint32_t n_blocks = super->s_nblocks;
for (uint32_t i = 0; i < n_blocks; i++)
{
if (block_is_free(i))
{
bitmap[i / 32] &= ~(1 << (i % 32));
flush_block(bitmap);
return i;
}
}
return -E_NO_DISK;
}Exercise 4
file_block_walk()
先保证块号合法,此后检查直接块,最后检查间接块。必要时分配一个间接块。
static int file_block_walk(struct File *f, uint32_t filebno,
uint32_t **ppdiskbno, bool alloc)
{
// LAB 5: Your code here.
if (filebno >= NDIRECT + NINDIRECT)
return -E_INVAL;
if (filebno < NDIRECT)
{
*ppdiskbno = f->f_direct + filebno;
return 0;
}
filebno -= NDIRECT;
if (f->f_indirect == 0)
{
if (!alloc)
return -E_NOT_FOUND;
int new_indirect_block = alloc_block();
if (new_indirect_block < 0)
return new_indirect_block;
f->f_indirect = new_indirect_block;
memset(diskaddr(f->f_indirect), 0, BLKSIZE);
}
*ppdiskbno = (uint32_t *)diskaddr(f->f_indirect) + filebno;
return 0;
}file_get_block()
依靠file_block_walk()寻找对应的块号指针,必要时分配一个块,然后返回相应的地址。
int file_get_block(struct File *f, uint32_t filebno, char **blk)
{
// LAB 5: Your code here.
uint32_t *disk_block = NULL;
int r = file_block_walk(f, filebno, &disk_block, 1);
if (r < 0)
return r;
if (*disk_block == 0)
{
r = alloc_block();
if (r < 0)
return r;
*disk_block = r;
}
*blk = (char *)diskaddr(*disk_block);
#ifdef Lab_5_Challenge_2
r = buffer_visit();
if (r < 0)
return r;
#endif
return 0;
}Exercise 5
通过openfile_lookup()查看已经打开的对应文件,然后用file_read()写入。最后更新offset。
int serve_read(envid_t envid, union Fsipc *ipc)
{
struct Fsreq_read *req = &ipc->read;
struct Fsret_read *ret = &ipc->readRet;
if (debug)
cprintf("serve_read %08x %08x %08x\n", envid, req->req_fileid,
req->req_n);
// Lab 5: Your code here:
struct OpenFile *open_file = NULL;
int r = openfile_lookup(envid, req->req_fileid, &open_file);
if (r < 0)
return r;
r = file_read(open_file->o_file, ret->ret_buf, req->req_n,
open_file->o_fd->fd_offset);
if (r < 0)
return r;
open_file->o_fd->fd_offset += r;
return r;
}Exercise 6
serve_write()
仿照serve_read()实现即可。
int serve_write(envid_t envid, struct Fsreq_write *req)
{
if (debug)
cprintf("serve_write %08x %08x %08x\n", envid, req->req_fileid,
req->req_n);
// LAB 5: Your code here.
struct OpenFile *open_file = NULL;
int r = openfile_lookup(envid, req->req_fileid, &open_file);
if (r < 0)
return r;
r = file_write(open_file->o_file, req->req_buf, req->req_n,
open_file->o_fd->fd_offset);
if (r < 0)
return r;
open_file->o_fd->fd_offset += r;
return r;
}devfile_write()
仿照devfile_read()实现即可。
static ssize_t devfile_write(struct Fd *fd, const void *buf, size_t n)
{
// Make an FSREQ_WRITE request to the file system server. Be
// careful: fsipcbuf.write.req_buf is only so large, but
// remember that write is always allowed to write *fewer*
// bytes than requested.
// LAB 5: Your code here
assert(n <= sizeof(fsipcbuf.write.req_buf));
fsipcbuf.write.req_fileid = fd->fd_file.id;
fsipcbuf.write.req_n = n;
memcpy(fsipcbuf.write.req_buf, buf, n);
int r = fsipc(FSREQ_WRITE, NULL);
if (r < 0)
return r;
assert(r <= n);
return r;
}Exercise 7
设置段寄存器的权限位以及EFLAGS中的标记位即可。特别注意envid2env()的checkperm传入1。另外,在syscall()中添加case以正确分派SYS_env_set_trapframe。
static int sys_env_set_trapframe(envid_t envid, struct Trapframe *tf)
{
// LAB 5: Your code here.
// Remember to check whether the user has supplied us with a good
// address!
struct Env *env_ptr;
if (envid2env(envid, &env_ptr, 1) < 0)
return -E_BAD_ENV;
env_ptr->env_tf = *tf;
env_ptr->env_tf.tf_cs |= GD_UT | 3;
env_ptr->env_tf.tf_ss |= GD_UD | 3;
env_ptr->env_tf.tf_ds |= GD_UD | 3;
env_ptr->env_tf.tf_es |= GD_UD | 3;
env_ptr->env_tf.tf_eflags |= FL_IF;
env_ptr->env_tf.tf_eflags &= ~FL_IOPL_MASK;
env_ptr->env_tf.tf_eflags |= FL_IOPL_0;
return 0;
}Exercise 8
duppage()
这意味着判断!(pte & PTE_SHARE)才进行COW复制。此外,调用sys_page_map()时的perm也与Lab 4有区别。
static int duppage(envid_t envid, unsigned pn)
{
int r;
pte_t pte = uvpt[pn];
void *addr = (void *)(pn * PGSIZE);
if ((pte & (PTE_P | PTE_U)) != (PTE_P | PTE_U))
panic("`%s' error: pn %u is wrong.", __func__, pn);
if (!(pte & PTE_SHARE) && ((pte & PTE_W) || (pte & PTE_COW)))
{
r = sys_page_map(0, addr, envid, addr,
(pte & PTE_SYSCALL & ~PTE_W) | PTE_COW);
if (r < 0)
panic("`%s' error: %e.", __func__, r);
r = sys_page_map(0, addr, 0, addr,
(pte & PTE_SYSCALL & ~PTE_W) | PTE_COW);
if (r < 0)
panic("`%s' error: %e.", __func__, r);
}
else
{
r = sys_page_map(0, addr, envid, addr, pte & PTE_SYSCALL);
if (r < 0)
panic("`%s' error: %e.", __func__, r);
}
return 0;
}copy_shared_pages()
逐页映射即可;遇到未映射的页目录项可以直接跳过4 MB。
static int copy_shared_pages(envid_t child)
{
// LAB 5: Your code here.
uintptr_t address = 0;
while (address < UTOP)
{
if (uvpd[PDX(address)] & PTE_P)
{
pte_t pte = uvpt[PGNUM(address)];
if ((pte & (PTE_P | PTE_U | PTE_SHARE)) ==
(PTE_P | PTE_U | PTE_SHARE))
{
int r = sys_page_map(0, (void *)address, child,
(void *)address, pte & PTE_SYSCALL);
if (r < 0)
return r;
}
address += PGSIZE;
}
else
address += PTSIZE;
}
return 0;
}Exercise 9
只需要在trap_dispatch()中的switch()增加两个case:
case IRQ_OFFSET + IRQ_KBD:
kbd_intr();
lapic_eoi();
return;
case IRQ_OFFSET + IRQ_SERIAL:
serial_intr();
lapic_eoi();
return;Exercise 10
仿照输出重定向即可。注意Unix Style的输入文件描述符是0。
// LAB 5: Your code here.
if ((fd = open(t, O_RDONLY)) < 0)
{
cprintf("open %s for read: %e", t, fd);
exit();
}
if (fd != 0)
{
dup(fd, 0);
close(fd);
}Challenge 2
我添加了一个结构:struct buffer_block,用于标记在内存中分配地址的文件块。即,每一个内存中分配地址的文件块,会对应一个分配的struct buffer_block。
buffer_block至多有N_BUFFER个,通过空闲链表buffer_free和已分配链表buffer_used管理。
当已分配的buffer_block达到N_BUFFER个时,会尝试驱逐一些块。优先驱逐没有PTE_A的块。如果没有这样的块,则会优先驱逐最先加入已分配链表的块。
每次调用bc_pgfault()时,会为文件块分配一个新的内存块。这时,也会分配一个新的buffer_block。而每次file_get_block()时,会调用buffer_visit()。buffer_visit()会更新一个访问计数器;当访问计数器增加到一定值时,会清除所有的PTE_A。
这样,可以近似达到LRU的效果。接下来是代码:
// in fs.c
#define N_BUFFER 1024
#define N_VISIT 256
int n_visit = 0;
struct buffer_block
{
struct buffer_block *prev, *next;
void *buffer_ptr;
};
struct buffer_block buffers[N_BUFFER];
struct buffer_block buffer_free, buffer_used;
int n_buffer_used;
int buffer_alloc(void *ptr)
{
if (ptr < diskaddr(2))
return 0;
cprintf("new block\n");
if (n_buffer_used == N_BUFFER)
{
int r = buffer_evict();
if (r < 0)
return r;
}
struct buffer_block *new_block = buffer_free.next;
buffer_free.next->next->prev = &buffer_free;
buffer_free.next = buffer_free.next->next;
new_block->next = &buffer_used;
new_block->prev = buffer_used.prev;
buffer_used.prev->next = new_block;
buffer_used.prev = new_block;
new_block->buffer_ptr = ptr;
n_buffer_used++;
return 0;
}
int buffer_visit(void)
{
n_visit++;
if (n_visit < N_VISIT)
return 0;
n_visit = 0;
for (struct buffer_block *p = buffer_used.next; p != &buffer_used;
p = p->next)
{
flush_block(p->buffer_ptr);
int r = sys_page_map(0, p->buffer_ptr, 0, p->buffer_ptr,
uvpt[PGNUM(p->buffer_ptr)] & PTE_SYSCALL);
if (r < 0)
return r;
}
return 0;
}
int buffer_evict(void)
{
struct buffer_block *p = buffer_used.next;
while (p != &buffer_used)
{
struct buffer_block *p_next = p->next;
if (!(uvpt[PGNUM(p->buffer_ptr)] & PTE_A))
{
flush_block(p->buffer_ptr);
int r = sys_page_unmap(0, p->buffer_ptr);
if (r < 0)
return r;
p->prev->next = p->next;
p->next->prev = p->prev;
p->buffer_ptr = NULL;
p->prev = buffer_free.prev;
buffer_free.prev->next = p;
p->next = &buffer_free;
buffer_free.prev = p;
n_buffer_used--;
}
p = p_next;
}
if (n_buffer_used == N_BUFFER)
{
p = buffer_used.next;
flush_block(p->buffer_ptr);
int r = sys_page_unmap(0, p->buffer_ptr);
if (r < 0)
return r;
p->prev->next = p->next;
p->next->prev = p->prev;
p->buffer_ptr = NULL;
p->prev = buffer_free.prev;
buffer_free.prev->next = p;
p->next = &buffer_free;
buffer_free.prev = p;
n_buffer_used--;
}
return 0;
}
void buffer_init(void)
{
buffer_free.next = buffers;
buffers[0].prev = &buffer_free;
buffers[0].next = buffers + 1;
for (size_t i = 1; i < N_BUFFER - 1; i++)
{
buffers[i].prev = buffers + i - 1;
buffers[i].next = buffers + i + 1;
}
buffers[N_BUFFER - 1].prev = buffers + N_BUFFER - 2;
buffers[N_BUFFER - 1].next = &buffer_free;
buffer_free.prev = buffers + N_BUFFER - 1;
buffer_used.next = &buffer_used;
buffer_used.prev = &buffer_used;
}在file_get_block()后:
*blk = (char *)diskaddr(*disk_block);
// 以下是新添加的
r = buffer_visit();
if (r < 0)
return r;
// 以上是新添加的
return 0;free_block()后也可以从内存中解分配:
void free_block(uint32_t blockno)
{
// Blockno zero is the null pointer of block numbers.
if (blockno == 0)
panic("attempt to free zero block");
bitmap[blockno/32] |= 1<<(blockno%32);
// 以下是新添加的
int r = sys_page_unmap(0, diskaddr(blockno));
if (r < 0)
panic("`%s', error: %e.", __func__, r);
}在bc_pgfault()后要调用buffer_alloc():
// in bc.c, function `bc_pgfault()`
// Check that the block we read was allocated. (exercise for
// the reader: why do we do this *after* reading the block
// in?)
if (bitmap && block_is_free(blockno))
panic("reading free block %08x\n", blockno);
int buffer_alloc(void *ptr);
r = buffer_alloc(addr);
if (r < 0)
panic("`%s' error: %e", __func__, r);在bc_init()中调用buffer_init():
void bc_init(void)
{
void buffer_init(void);
buffer_init();
// ......
}在测试中,可以使用较小的N_BUFFER和N_VISIT(如16和4),以便在testfile的大文件测试中出现驱逐现象。
附录:以往的Challenge
在这一部分,我将回顾以往的Challenge,并添加补充说明。
Lab 1
Lab 1中,我选择了唯一的Challenge:JOS console的彩色输出。这部分代码在后续的Lab中无需改动。
Lab 2
Lab 2中,我选择了Challenge 1:内核部分虚拟内存使用4 MB大页,以及Challenge 2:虚拟内存映射相关的JOS monitor命令。
Lab 2 Challenge 1
在Lab 2 Challenge 1中,在映射内核部分的虚拟内存时,使用了4 MB大页(如果处理器支持的话)。这需要修改CR4寄存器的PSE标记位:
// in pmap.c, function `mem_init()`
uint32_t edx = 0;
cpuid(1, NULL, NULL, NULL, &edx);
int is_large_page_supported = (edx >> 3) & 1;
if (is_large_page_supported)
{
uint32_t cr4 = rcr4();
cr4 |= CR4_PSE;
lcr4(cr4);
is_large_page_enabled = 1; // 全局变量
boot_map_region_large_page(kern_pgdir, KERNBASE,
(size_t)((1ull << 32) - KERNBASE), 0, PTE_W);
}
else
{
boot_map_region(kern_pgdir, KERNBASE, (size_t)((1ull << 32) - KERNBASE),
0, PTE_W);
}而Lab 4启用了多处理器支持。因此,每个处理器核心都需要CR4寄存器的PSE标记位。在mp_main()中:
// in init.c, function `mp_main()`
// We are in high EIP now, safe to switch to kern_pgdir
// `is_large_page_enabled` 是 `pmap.c` 中的全局变量
extern int is_large_page_enabled;
if (is_large_page_enabled)
{
uint32_t cr4 = rcr4();
cr4 |= CR4_PSE;
lcr4(cr4);
}
lcr3(PADDR(kern_pgdir));Lab 2 Challenge 2
这部分代码实在没有用处,因此,自Lab 3起,我删除了这部分代码。
Lab 3
Lab 3中,我选择了Challenge 2:JOS monitor的继续与单步运行命令与Challenge 3:使用sysenter与sysexit指令实现快速系统调用。
Lab 3 Challenge 2
这部分代码无需改动。
Lab 3 Challenge 3
在Lab 3 Challenge 3中,我使用sysenter和sysexit指令实现了快速系统调用。
// in trap.c, function `trap_init()`
#define IA32_SYSENTER_CS 0x174
#define IA32_SYSENTER_ESP 0x175
#define IA32_SYSENTER_EIP 0x176
void fast_system_call();
asm volatile("wrmsr" : : "c"(IA32_SYSENTER_CS), "d"(0),
"a"(GD_KT));
asm volatile("wrmsr" : : "c"(IA32_SYSENTER_ESP), "d"(0),
"a"(KSTACKTOP));
asm volatile("wrmsr" : : "c"(IA32_SYSENTER_EIP), "d"(0),
"a"(fast_system_call));// in lib/syscall.c, function `syscall()`
switch (num)
{
case SYS_cputs: case SYS_cgetc: case SYS_getenvid: case SYS_env_destroy:
asm volatile("pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"leal 114514f, %%esi\n"
"sysenter\n"
"114514:\n"
"popl %%ebp\n"
: "=a"(ret)
: "a"(num), "d"(a1), "c"(a2), "b"(a3), "D"(a4)
: "%esi", "memory");
break;
default:
// 保留原有的,基于 int $0x30 的系统调用方法
// (省略代码)
}// in trapentry.S
.global fast_system_call
.type fast_system_call, @function
.align 2
fast_system_call:
pushl $0
pushl %edi
pushl %ebx
pushl %ecx
pushl %edx
pushl %eax
call syscall
movl %esi, %edx
movl %ebp, %ecx
sysexit而Lab 4启用了多处理器支持,需要对每个处理器内核初始化msr寄存器,并设置单独的内核栈。为此,把原本的asm volatile("wrmsr" ...从trap_init()移到trap_init_percpu():
// in trap.c, function `trap_init_percpu()`
#define IA32_SYSENTER_CS 0x174
#define IA32_SYSENTER_ESP 0x175
#define IA32_SYSENTER_EIP 0x176
void fast_system_call();
asm volatile("wrmsr" : : "c"(IA32_SYSENTER_CS), "d"(0),
"a"(GD_KT));
asm volatile("wrmsr" : : "c"(IA32_SYSENTER_ESP), "d"(0),
"a"(KSTACKTOP -
current_cpuid * (KSTKSIZE + KSTKGAP)));
asm volatile("wrmsr" : : "c"(IA32_SYSENTER_EIP), "d"(0),
"a"(fast_system_call));需要在进入内核时加锁。为此,在kern/syscall.c中新增一个函数:
int32_t syscall_fast(uint32_t syscallno, uint32_t a1, uint32_t a2, uint32_t a3,
uint32_t a4, uint32_t a5)
{
lock_kernel();
int32_t ret = syscall(syscallno, a1, a2, a3, a4, a5);
unlock_kernel();
return ret;
}并将trapentry.S中的call syscall改成call syscall_fast。
在Lab 4中,使用了时钟中断。注意到sysenter指令会把EFLAGS寄存器的IF位清零以屏蔽外部中断,但sysexit指令却不会复位IF。因此,在sysexit之前,需要sti。
// in trapentry.S
.global fast_system_call
.type fast_system_call, @function
.align 2
fast_system_call:
pushl $0
pushl %edi
pushl %ebx
pushl %ecx
pushl %edx
pushl %eax
call syscall_fast // 现在调用syscall_fast
movl %esi, %edx
movl %ebp, %ecx
sti // sti以复位IF
sysexit那么,哪些系统调用函数可以通过快速系统调用实现呢?注意到sysexit指令总是回到调用点,因此,sys_yield(),以及其他涉及到yield的系统调用,如sys_ipc_recv(),不能使用快速系统调用。(还有另外一个原因:我做了Lab 4的Challenge 3,因此sys_yield()需要保存浮点状态,但是快速系统调用不会做这一点)
sys_env_set_trapframe()可能会修改EIP和/或ESP,它也不能用快速系统调用实现。
sys_page_map()也不能通过快速系统调用实现,因为它有5个参数,而快速系统调用至多支持4个参数。
因为sys_exofork()直接在inc/lib.h内联定义,为了保证代码的简单性,它也无法通过快速系统调用实现。
最终,修改lib/syscall.c中的syscall()如下:
switch (num)
{
case SYS_cputs: case SYS_cgetc: case SYS_getenvid: case SYS_page_alloc:
case SYS_page_unmap: case SYS_env_set_status: case SYS_env_set_pgfault_upcall:
case SYS_ipc_try_send:
asm volatile("pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"leal 114514f, %%esi\n"
"sysenter\n"
"114514:\n"
"popl %%ebp\n"
: "=a"(ret)
: "a"(num), "d"(a1), "c"(a2), "b"(a3), "D"(a4)
: "%esi", "cc", "memory");
break;
default:
asm volatile("int %1\n"
: "=a"(ret)
: "i"(T_SYSCALL), "a"(num), "d"(a1), "c"(a2), "b"(a3),
"D"(a4), "S"(a5)
: "cc", "memory");
}注意,考虑到系统调用可能改变条件码,我对clobber加上了"cc",这是Lab 3中我没有做的。
Lab 4
在Lab 4中,我选择了Challenge 3:使用fxrstor和fxsave指令进行浮点状态的保存和恢复。这部分代码无需改动。
现在,我的JOS Lab可以将Lab 1 Challenge 1、Lab 2 Challenge 1、Lab 3 Challenge 2 & 3、Lab 4 Challenge 3和Lab 5 Challenge 2全部启用的情况下,在Lab 5获得满分(喜)。